def classifyDoc(document): featurefile = DIR['DATA'] + 'features_svm.txt' classify = DIR['BASE'] + "lib/svm-light/svm_classify" model = DIR['DATA'] + "sec-tfidf-model.txt" outfile = DIR['DATA'] + "svm-out-sent.txt" #sumlength = 5 client_socket = getConnection() doc = Document(document) #----------------------------------------- # Clubbing sentences in sections and passing to the ranker sections = [] for sec, block in doc.document.items(): sentences = '' for key in sorted(block.keys()): sentences += (str(block[key])) sections.append(sentences) sec_ranker = Ranker(sections) sents, offset = doc.all_sentences() ranker = TextRank(sents) ranker.rank() #----------------------------------------- sents, sent_indices = getSecRankedSent(doc) #----------------------------------------- # The sent_idx needs to be converted to reflect the corresponding # section index sec_indices = sent2Section(doc, sent_indices) summary = [] classified = [] sum_len = 0 for sent, sec_idx in zip(sents, sec_indices): #----------------------------------------- # dependency parse tree = parseTrees(getDepParse(client_socket, sent)) #----------------------------------------- deleteFiles([featurefile]) feature_string = "+1" feature_string += processTree(tree, sec_ranker, sec_idx, False) writeToFile(featurefile, feature_string + '\n', 'a') deleteFiles([outfile]) subprocess.call([classify, featurefile, model, outfile]) with open(outfile, 'r') as ofile: sent_val = float(ofile.read().strip()) classified.append((sent, sent_val)) for sent, val in sorted(classified, key=itemgetter(1)): summary.append(sent) sum_len += len(sent.split(' ')) if sum_len > 130: break writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w') print '\n'.join(summary)
def mainline(train=False): datadir = DIR['BASE'] + "data/" if train is True: featurefile = datadir + 'train-features.txt' xmldir = DIR['BASE'] + "demo/train/" else: featurefile = datadir + 'test-features.txt' xmldir = DIR['BASE'] + "demo/test/" deleteFiles([featurefile]) #infile = xmldir + 'C08-1122-parscit-section.xml' client_socket = getConnection() for infile in glob(xmldir + "*.xml"): try: print infile + " is being processed." if train is True: generateTrainFeatures(client_socket, infile, featurefile) else: generateTestFeatures(client_socket, infile, featurefile) except Exception as e: print "Some Exception in the main pipeline" print (str(type(e))) print str(e) logging.exception("Something awfull !!") model = DIR['DATA'] + "sec-tfidf-model.txt" if train is False: # Testing outfile = DIR['DATA'] + "sec-tfidf-test-out.txt" for gamma in [1.0]: predictSvm(featurefile, model + str(gamma), outfile) outstring = "Testing. Weight : " + str(gamma) analyze(featurefile, outfile, outstring) #pickleIt() else: # Training outfile = DIR['DATA'] + "sec-tfidf-train-out.txt" deleteFiles([outfile]) for gamma in [1.0]: #trainSvm(featurefile, model + str(gamma), gamma) trainSvm(featurefile, model, gamma) predictSvm(featurefile, model, outfile) outstring = "Training. gamma : " + str(gamma) analyze(featurefile, outfile, outstring=outstring) pickleIt()
def mainline(train=False):
    """Generate SVM features for every demo XML file, then train or test.

    :param train: True → build features from ``demo/train/`` and fit the
                  model; False (default) → build features from ``demo/test/``
                  and evaluate the previously saved model.
    Side effects: deletes/rewrites feature and output files, uses the parser
    connection, shells into the SVM wrappers, logs per-file failures.
    """
    datadir = DIR["BASE"] + "data/"
    if train is True:
        featurefile = datadir + "train-features.txt"
        xmldir = DIR["BASE"] + "demo/train/"
    else:
        featurefile = datadir + "test-features.txt"
        xmldir = DIR["BASE"] + "demo/test/"
    deleteFiles([featurefile])
    # infile = xmldir + 'C08-1122-parscit-section.xml'
    client_socket = getConnection()
    for infile in glob(xmldir + "*.xml"):
        try:
            print infile + " is being processed."
            if train is True:
                generateTrainFeatures(client_socket, infile, featurefile)
            else:
                generateTestFeatures(client_socket, infile, featurefile)
        except Exception as e:
            # Best-effort batch: report the failure and move on to the
            # next file rather than aborting the whole run.
            print "Some Exception in the main pipeline"
            print (str(type(e)))
            print str(e)
            logging.exception("Something awfull !!")
    model = DIR["DATA"] + "sec-tfidf-model.txt"
    if train is False:
        # TESTING
        outfile = DIR["DATA"] + "sec-tfidf-test-out.txt"
        predictSvm(featurefile, model, outfile)
        extractValues(outfile)
        outstring = "Default values Test results"
        analyze(featurefile, outfile, outstring=outstring)
        pickleIt()
    else:
        # TRAINING
        trainSvm(featurefile, model)
        outfile = DIR["DATA"] + "sec-tfidf-train-out.txt"
        predictSvm(featurefile, model, outfile)
        outstring = "Default values"
        analyze(featurefile, outfile, outstring=outstring)
def mainline(train=False): datadir = DIR['BASE'] + "data/" if train is True: featurefile = datadir + 'train-features.txt' xmldir = DIR['BASE'] + "demo/train/" else: featurefile = datadir + 'test-features.txt' xmldir = DIR['BASE'] + "demo/test/" deleteFiles([featurefile]) #infile = xmldir + 'C08-1122-parscit-section.xml' client_socket = getConnection() for infile in glob(xmldir + "*.xml"): try: print infile + " is being processed." if train is True: generateTrainFeatures(client_socket, infile, featurefile) else: generateTestFeatures(client_socket, infile, featurefile) except Exception as e: print "Some Exception in the main pipeline" print(str(type(e))) print str(e) logging.exception("Something awfull !!") model = DIR['DATA'] + "sec-tfidf-model.txt" if train is False: # TESTING outfile = DIR['DATA'] + "sec-tfidf-test-out.txt" predictSvm(featurefile, model, outfile) extractValues(outfile) outstring = "Default values Test results" analyze(featurefile, outfile, outstring=outstring) pickleIt() else: # TRAINING trainSvm(featurefile, model) outfile = DIR['DATA'] + "sec-tfidf-train-out.txt" predictSvm(featurefile, model, outfile) outstring = "Default values" analyze(featurefile, outfile, outstring=outstring)
def classifyDoc(document):
    """Summarise *document* by SVM-filtering its TextRank-ranked sentences.

    Walks the sentences in TextRank order, skipping invalid ones and the
    abstract, classifies each with svm-light, and keeps positively scored
    sentences until 10 are accepted, ~130 words are reached, or 20
    candidates have been examined. Writes the summary to
    ``svm_summary.txt`` and prints it.

    :param document: input accepted by ``Document`` (not shown here).
    Side effects: parser connection, repeated rewrites of the feature and
    SVM output files, one ``svm_classify`` subprocess per candidate.
    """
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    #sumlength = 5
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Clubbing sentences in sections and passing to the ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    looper = 20   # hard cap on how many ranked candidates are examined
    num = 10      # stop once this many sentences have been accepted
    x = 0         # cursor into ranker.scores
    summary = []
    sent_idx = [0]  # one-element list: sent2Section expects a sequence
    sum_len = 0     # running summary length, in space-separated words
    while num > 0:
        # ranker.scores pairs a sentence index with its score; offset maps
        # it back into doc's global sentence indexing.
        idx = ranker.scores[x][0] + offset
        x += 1
        if not validSentence(doc[idx]):
            continue
        elif doc.get_section_name(idx) == 'abstract':
            continue
        sent_idx[0] = idx
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(getDepParse(client_socket,
                                      doc[idx].sentence.encode('utf-8')))
        #-----------------------------------------
        # The sent_idx needs to be converted to reflect the corresponding
        # section index
        sec_idx = sent2Section(doc, sent_idx)
        #-----------------------------------------
        # One-line feature file; svm_classify writes the decision value to
        # outfile, which is read back as a float.
        deleteFiles([featurefile])
        feature_string = "+1"
        feature_string += processTree(tree, sec_ranker, sec_idx[0], False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
        if sent_val > 0:
            # Positive decision value → keep the sentence.
            summary.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
            sum_len += len(doc[idx].sentence.encode('utf-8').split(' '))
        if sum_len > 130:
            break
        looper -= 1
        if looper == 0:
            print "Looper Done"
            break
    writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w')
    print '\n'.join(summary)
def classifyDoc(document): featurefile = DIR['DATA'] + 'features_svm.txt' classify = DIR['BASE'] + "lib/svm-light/svm_classify" model = DIR['DATA'] + "sec-tfidf-model.txt" outfile = DIR['DATA'] + "svm-out-sent.txt" #sumlength = 5 client_socket = getConnection() doc = Document(document) #----------------------------------------- # Clubbing sentences in sections and passing to the ranker sections = [] for sec, block in doc.document.items(): sentences = '' for key in sorted(block.keys()): sentences += (str(block[key])) sections.append(sentences) sec_ranker = Ranker(sections) sents, offset = doc.all_sentences() ranker = TextRank(sents) ranker.rank() looper = 20 num = 10 x = 0 summary = [] sent_idx = [0] sum_len = 0 while num > 0: idx = ranker.scores[x][0] + offset x += 1 if not validSentence(doc[idx]): continue elif doc.get_section_name(idx) == 'abstract': continue sent_idx[0] = idx #----------------------------------------- # dependency parse tree = parseTrees( getDepParse(client_socket, doc[idx].sentence.encode('utf-8'))) #----------------------------------------- # The sent_idx needs to be converted to reflect the corresponding # section index sec_idx = sent2Section(doc, sent_idx) #----------------------------------------- deleteFiles([featurefile]) feature_string = "+1" feature_string += processTree(tree, sec_ranker, sec_idx[0], False) writeToFile(featurefile, feature_string + '\n', 'a') deleteFiles([outfile]) subprocess.call([classify, featurefile, model, outfile]) with open(outfile, 'r') as ofile: sent_val = float(ofile.read().strip()) if sent_val > 0: summary.append(doc[idx].sentence.encode('utf-8')) num -= 1 sum_len += len(doc[idx].sentence.encode('utf-8').split(' ')) if sum_len > 130: break looper -= 1 if looper == 0: print "Looper Done" break writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w') print '\n'.join(summary)