def getUnwanted(outfile):
    dir = DIR['BASE'] + "demo/"
    os.chdir(dir)
    samples = ''
    total = 0
    num = 12
    for file in glob("*.xml"):
        try:
            doc = Document(file)
            sentences, offset = doc.all_sentences()
            # Ranker
            ranker = TextRank(sentences)
            ranker.rank()
            scores = sorted(ranker.scores, key=itemgetter(1))
            for x in range(num):
                idx = scores[x][0] + offset
                samples += doc[idx].sentence.encode('utf-8') + '\n'
            total += 1
            print(file + " : Done")
        except Exception as e:
            print(file + str(e))
    # for now this is the location of the file
    writeToFile(outfile, samples, 'w')
    print("Total number of files processed successfully : " + str(total))
def summarize_secitons(document, sections, coef=0.8):
    logit(document)
    doc = Document(document)
    all_sentences, all_offset = doc.all_sentences()
    summ = []
    for section_name in sections:
        sec_sentences, sec_offset = doc.section_sentences(section_name)
        limit = len(sec_sentences)
        # Ranker
        ranker = SectionMMR(all_sentences)
        ranker.rank(sec_offset=sec_offset, limit=limit, coef=coef)
        sentencs = ranker.scores
        summary = []
        for x in range(num):
            idx = sentencs[x][0] + sec_offset
            sent = doc[idx].sentence
            summary.append((sent, sentencs[x][1], doc.get_section_name(idx)))
            summ.append(sent)
        text = ""
        logit("\nSection : " + section_name)
        for sent, score, section in summary:
            text += "\n" + sent.encode("utf-8")
        logit(text)
    file = DIR["BASE"] + "data/Summary.txt"
    with open(file, "w") as sfile:
        sfile.write("\n".join(summ).encode("utf-8"))
def get_pos_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    #sentences, o = doc.all_sentences()
    #ranker = Ranker(sentences, tfidf=False)
    #-----------------------------------------
    # Instead of the above, now sentences will be clubbed into sections and
    # passed to the ranker, which is to be returned
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    sent, offset = doc.section_sentences('abstract')
    sent_idx = range(offset, offset + len(sent))
    samples = '\n'.join(sent)
    writeToFile(outfile, samples, 'w')
    #return ranker, sent_idx
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------Positive---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx
def run(self, q=Query()):
    '''
    Dispatch all arguments into log.txt.
    '''
    log = Document("log.txt")
    story = log.get_params()
    for i in story:
        self.dispatch(story[i] + "\n")
def edit( self, text, description=None, text_format=None ):
    """Edit the News Item.
    """
    if text_format is None:
        text_format = getattr(self, 'text_format', 'structured-text')
    if description is not None:
        self.setDescription( description )
    Document.edit( self, text_format, text )
def _edit( self, text, description=None, text_format=None ):
    """ Edit the News Item
    """
    if text_format is None:
        text_format = getattr(self, 'text_format', 'html')
    if description is not None:
        self.setDescription( description )
    Document._edit( self, text_format, text )
def __init__(self, doc, win=None, referer=None, lastModified=None, cookie=''):
    Document.__init__(self, doc)
    self._win = win
    self._referer = referer
    self._lastModified = lastModified
    self._cookie = cookie
    self._html = None
    self.current = None
def edit(self, text_format, text, file='', REQUEST=None):
    """ Edit the discussion item.
    """
    Document.edit(self, text_format, text, file)
    if REQUEST is not None:
        return self.editForm(self, REQUEST,
                             portal_status_message='Discussion item changed.')
def generateTrainFeatures(client_socket, infile, featurefile):
    #------------------------------------------------
    doc = Document(infile)
    all_sentences, all_offset = doc.all_sentences()
    #------------------------------------------------
    # Positive sentences
    pos_sents, offset = doc.section_sentences('abstract')
    sent_indices = range(offset, offset + len(pos_sents))
    #-----------------------------------------
    # Sectional Ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    #-----------------------------------------
    # Count ranker
    #count_ranker = Ranker(all_sentences, tfidf=False)
    #-----------------------------------------
    for sentence, sent_idx, sec_idx in zip(pos_sents, sent_indices, sec_indices):
        feature_string = '+1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        #feature_string += processTree(tree, count_ranker, sent_idx, True)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    # Negative sentences
    neg_ranker = TextRank(all_sentences)
    neg_ranker.rank()
    num = 5
    x = -1
    neg_sents = []
    sent_indices = []
    while num > 0:
        idx = neg_ranker.scores[x][0] + all_offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_indices.append(idx)
            neg_sents.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
    sec_indices = sent2Section(doc, sent_indices)
    #------------------------------------------------
    for sentence, sent_idx, sec_idx in zip(neg_sents, sent_indices, sec_indices):
        feature_string = '-1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        #feature_string += processTree(tree, count_ranker, sent_idx, True)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    print "All input files processed to create feature vectors for training."
def __init__(self, doc, win=None, referer=None, lastModified=None, cookie=''):
    Document.__init__(self, doc)
    self._win = win
    self._referer = referer
    self._lastModified = lastModified
    self._cookie = cookie
    self._html = None
    self._domain = urlparse(self._win.url).hostname if self._win else ''
    self.current = None
class PubmedArticleSet(handler.ContentHandler):
    def __init__(self):
        handler.feature_external_ges = "false"
        self.docs = {}
        self.doc = None
        self.chars = ""

    def startElement(self, name, attr):
        if name == 'PubmedArticle' or name == 'PubmedBookArticle':
            self.doc = Document()
        self.chars = ""

    def endElement(self, name):
        if name == 'PubmedArticle':
            self.docs[self.doc.pmid] = self.doc
        if name == 'PMID' and self.doc.pmid == None:
            self.doc.pmid = self.text()
        if name == 'ArticleTitle':
            self.doc.title = self.text()
        if name == 'AbstractText':
            if self.doc.abstract == None:
                self.doc.abstract = self.text()
            else:
                self.doc.abstract += self.text()
        if name == 'DescriptorName':
            self.doc.addMeSH(self.text())

    def characters(self, data):
        self.chars += data

    def text(self):
        return self.chars.strip().encode('ascii', 'ignore')

    ## Method to parse a PubmedArticleSet XML file.
    # @param location The location of the xml file to parse
    # return A PubmedArticleSet object
    @classmethod
    def parse(self, location):
        parser = make_parser()
        parser.setFeature("http://xml.org/sax/features/external-general-entities", False)
        parser.setFeature("http://xml.org/sax/features/external-parameter-entities", False)
        handler = PubmedArticleSet()
        parser.setContentHandler(handler)
        try:
            f = open(location, 'r')
            parser.parse(f)
            f.close()
        except Exception, e:
            raise RuntimeError, "Could not parse PubmedArticleSet XML file at %s" % location
        return handler
def test2():
    cnf = QConfiguration()
    cnf.get_configuration('.', 'exam.cfg')
    print cnf._title
    print cnf._course
    print cnf._date
    doc = Document(cnf)
    doc.save('/home/javier/tmp', 'test')
def __init__(self, doc, win=None, referer=None, lastModified=None, cookie=""):
    Document.__init__(self, doc)
    self._win = win
    self._referer = referer
    self._lastModified = lastModified
    self._cookie = cookie
    self._html = None
    self._readyState = "loading"
    self._domain = urlparse(self._win.url).hostname if self._win else ""
    self.current = None
    self.__init_personality()
def get_test_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    sentences, offset = doc.all_sentences()
    ranker = TextRank(sentences)
    ranker.rank()
    num = 7
    x = 0
    samples = ''
    sent_idx = []
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x += 1
        #if not validSentence(doc[idx]):
        #    continue
        #else:
        #    sent_idx.append(idx)
        #    samples += doc[idx].sentence.encode('utf-8') + '\n'
        #    num -= 1
        sent_idx.append(idx)
        samples += doc[idx].sentence.encode('utf-8') + '\n'
        num -= 1
        #---------------------------------------------------
        # Storing the sentence in the dictionary for pickling for display
        infi = re.match(r'/home/ankur/devbench/scientific/scisumm/demo/(.+)-parscit-section\.xml',
                        infile).group(1)
        key = infi + "-" + str(idx)
        test_data[key] = {'sentence': doc[idx].sentence.encode('utf-8'),
                          'textrank': ranker.scores[x - 1][1],
                          'contextpre': getContext(doc, idx, -2),
                          'contextpos': getContext(doc, idx, 2)}
    writeToFile(outfile, samples, 'w')
    #ranker = Ranker(sentences, tfidf=False)
    #return ranker, sent_idx
    #-----------------------------------------
    # Calculating the sectional TF-IDF
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------" + str(doc) + "---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx, sent_idx
def classifyDoc(document):
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    #sumlength = 5
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Clubbing sentences in sections and passing to the ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    #-----------------------------------------
    sents, sent_indices = getSecRankedSent(doc)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding
    # section index
    sec_indices = sent2Section(doc, sent_indices)
    summary = []
    classified = []
    sum_len = 0
    for sent, sec_idx in zip(sents, sec_indices):
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(getDepParse(client_socket, sent))
        #-----------------------------------------
        deleteFiles([featurefile])
        feature_string = "+1"
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
        classified.append((sent, sent_val))
    for sent, val in sorted(classified, key=itemgetter(1)):
        summary.append(sent)
        sum_len += len(sent.split(' '))
        if sum_len > 130:
            break
    writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w')
    print '\n'.join(summary)
def createDocumentWithAbstract(self, document_id, document_year, document_title,
                               document_author, currentFile, bucketId):
    # Create a document
    document_path = self.root_path + "/nips" + bucketId
    document = Document(document_id, document_year, document_title,
                        document_author, currentFile, document_path)
    # Create an Abstract
    abstract_text = self.getAbstract(document)
    abstract = Abstract(document.document_id, document.year, abstract_text)
    # Link a Document to its abstract
    document.abstract = abstract
    # Store the Document and the abstract in memory.
    self.documents.append(document)
    self.abstracts.append(abstract)
def search(self, query):
    results = []
    query_document = Document(query)
    query_stems = query_document.get_stems()
    documents = self.corpus.get_documents()
    for doc in documents:
        document_id = doc.get_id()
        score = 0.0
        stemmed_document = doc.get_stems()
        for qstem in query_stems:
            if qstem in stemmed_document:
                term_frequency = self.metrics.get_term_frequency(document_id, qstem)
                score += term_frequency
        if score > 0.0:
            results.append({"id": doc.get_id(), "score": score, "text": doc.get_text()})
    return results
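The search() method above scores each corpus document by summing the term frequency of every query stem that appears in it. A minimal, self-contained sketch of the same scoring rule, with the corpus, stemmer, and metrics objects replaced by plain dictionaries (all names below are hypothetical and only illustrate the rule, they are not part of the original class):

# Hypothetical standalone illustration of the scoring rule used in search():
# score(doc) = sum of tf(doc, stem) over the query stems present in the document.
def toy_score(query_stems, doc_stems, tf):
    score = 0.0
    for stem in query_stems:
        if stem in doc_stems:
            score += tf.get(stem, 0)
    return score

# "data" appears 3 times, "mine" once, "cloud" not at all -> score 4.0
print(toy_score(["data", "mine", "cloud"],
                {"data", "mine"},
                {"data": 3, "mine": 1}))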
def getAbstracts(outfile):
    dir = DIR['BASE'] + "demo/"
    os.chdir(dir)
    samples = ''
    total = 0
    for file in glob("*.xml"):
        try:
            doc = Document(file)
            sent, offset = doc.section_sentences('abstract')
            samples += '\n'.join(sent)
            print(file + " : Done")
            total += 1
        except Exception as e:
            print(file + str(e))
    # for now this is the location of the file
    writeToFile(outfile, samples, 'w')
    print("Total number of files processed successfully : " + str(total))
def __init__(self, keyword, value=None, equation='', units=None, label=None, display=None):
    Document.__init__(self)
    self.keyword = keyword
    self.value = value
    self.equation = equation
    self.units = units
    self.label = label
    self.display = display
    self.variables = {}
    for keyword in EquationParser()(self.equation):
        self.variables[keyword] = None
    if self.isJustValue():
        self.set(eval(self.equation))
def summarize(document, all=True):
    doc = Document(document)
    sentences, offset = (doc.all_sentences() if all else doc.filtered_sentences())
    # Ranker
    ranker = TextRank(sentences)
    ranker.rank()
    scores = ranker.scores
    # Selector
    summary = []
    sum_len = 0
    for x in range(num):
        idx = scores[x][0] + offset
        sent = doc[idx].sentence
        if sum_len + len(sent.split(' ')) > MAXLEN:
            break
        summary.append((sent, scores[x][1], doc.get_section_name(idx)))
        sum_len += len(sent.split(' '))
    text = ''
    logit("\nP10-1024")
    logit("\nAll Sentences" if all else "\nFiltered Sentences")
    logit("Length of summary : " + str(sum_len))
    for sent, score, section in summary:
        text += '\n' + "[" + section.encode('utf-8') + "] " + sent.encode('utf-8')
        #"[" + str(score) + "] " + sent.encode('utf-8')
    logit(text)
    # Printer
    # this has to be automated
    file = DIR['BASE'] + "data/Summary.txt"
    with open(file, 'w') as sfile:
        sfile.write('\n'.join([sent for sent, sc, sec in summary]).encode('utf-8'))
    # Evaluator
    guess_summary_list = [file]
    ref_summary_list = [[DIR['BASE'] + "data/P10-1024-Ref1.txt"]]
    recall, precision, F_measure = PythonROUGE(guess_summary_list,
                                               ref_summary_list,
                                               ngram_order=1)
    logit("Recall:{0} ; Precision:{1} ; F:{2}".format(recall, precision, F_measure))
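A minimal invocation sketch for summarize(), assuming the module-level globals it relies on (DIR, MAXLEN, num, logit, PythonROUGE) are already configured elsewhere in the project; the input path below is a placeholder, not a path taken from the source:

# Hypothetical driver: the file name only mirrors the naming pattern used elsewhere in this corpus.
if __name__ == '__main__':
    paper = DIR['BASE'] + "demo/P10-1024-parscit-section.xml"  # assumed input location
    summarize(paper, all=True)    # rank every sentence
    summarize(paper, all=False)   # rank only the filtered sentences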
def __init__(self):
    '''
    Initialization of a Document object for configuration data and all default info.
    '''
    self.config = Document("config.txt")
    self.default_host = 'www.medcordex.eu'
    self.default_user = '******'
    self.default_passwd = 'sagitta'
    self.default_db = 'medcordex'
def generateTestFeatures(infile):
    doc = Document(infile)
    #------------------------------------------------
    # For display and analysis
    dir, filename = os.path.split(infile)
    fcode = re.match(r'(.+)-parscit-section\.xml', filename).group(1)
    #------------------------------------------------
    all_sentences, all_offset = doc.all_sentences()
    ranker = TextRank(all_sentences)
    ranker.rank()
    num = 7
    x = 0
    test_sents = []
    sent_indices = []
    while num > 0:
        idx = ranker.scores[x][0] + all_offset
        x += 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_indices.append(idx)
            test_sents.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
        #------------------------------------------------
        # For display and analysis
        key = fcode + '-' + str(idx)
        test_data[key] = {'sentence': doc[idx].sentence.encode('utf-8'),
                          'textrank': ranker.scores[x - 1][1],
                          'contextpre': getContext(doc, idx, -2),
                          'contextpos': getContext(doc, idx, 2)}
    #-----------------------------------------
    for sentence, sent_idx in zip(test_sents, sent_indices):
        key = fcode + '-' + str(sent_idx)
        print key
        print test_data[key]['contextpre']
        print "----Main sentence Start----"
        print test_data[key]['sentence']
        print "----Main sentence End----"
        print test_data[key]['contextpos']
        feature_string = raw_input()
        feature_string += '1'
        test_data[key]['reallbl'] = feature_string
def show_descriptions(self):
    '''
    Check the language option and an operation name in the arguments of the
    current class. Open the file in Manual that matches the selected language
    option, look up the operation name, and dispatch its description.
    '''
    try:
        language = self.getLanguage(self.args) + ".txt"
        path = os.path.dirname(os.path.abspath(__file__)) + "/Manual/"
        # check if the language file exists in the path
        if os.path.exists(os.path.join(path, language)):
            doc = Document(language, path)
        # otherwise raise an exception
        else:
            raise Exception("No language found with name : " + language)
        for a in self.args:
            description = str(doc.get_parameter(a))
            if description != "":
                self.dispatch(description)
            else:
                raise Exception("Wrong operation or no description implemented for this operation")
    except Exception as e:
        self.dispatch(e)
def main():
    clustered_corpus_path = 'clustered_corpus'
    clustered_corpus = read_clustered_corpus(clustered_corpus_path)
    corpus = merge_clustered_corpus_into_a_single_corpus(clustered_corpus)
    target_file_path = 'target.txt'
    text = read_text_file(target_file_path)
    document = Document(text)
    corpus = Corpus(corpus)
    clustered_corpus = ClusteredCorpus(clustered_corpus)
    candidate_to_rank_mapping = {}
    candidate_to_params_mapping = {}
    candidate_to_dfs_in_each_cluster_mapping = {}
    for candidate in document.get_candidates():
        tf = math.log(1.0 + document.get_tf_for(candidate), 10.0)
        # tf = document.get_tf_for(candidate)
        idf = math.log(1.0 + 1.0 / corpus.get_df_for(candidate), 2.0)
        cu = clustered_corpus.get_cu_for(candidate)
        rank = cu
        # rank = tf * cu
        # rank = tf * idf
        dfs_in_each_cluster = clustered_corpus.get_dfs_in_each_cluster_for(candidate)
        candidate_representative = corpus.get_representative_for(candidate)
        candidate_to_rank_mapping[candidate_representative] = rank
        candidate_to_params_mapping[candidate_representative] = (tf, idf, cu)
        candidate_to_dfs_in_each_cluster_mapping[candidate_representative] = dfs_in_each_cluster
    table = generate_table_based_on(
        candidate_to_rank_mapping,
        candidate_to_params_mapping,
        candidate_to_dfs_in_each_cluster_mapping
    )
    save_as_file(table)
    print('Done.')
def read_clustered_corpus(path):
    result = []
    for directory in os.listdir(path):
        cluster = []
        for file in os.listdir(os.path.join(path, directory)):
            text_file = read_text_file(os.path.join(path, directory, file))
            document = Document(text_file)
            cluster.append(document)
        result.append(cluster)
    return result
def __init_context(self):
    """
    Spidermonkey Context initialization.
    """
    document = Document(self)
    self.__dict__['__cx'] = self.__dict__['__rt'].new_context(alertlist=[])
    self.__dict__['__sl'] = []
    self.__dict__['__fl'] = [document]
    self.__init_properties(document)
    self.__init_methods()
    self.__finalize_context()
def map(self, line):
    # find cluster assignment by brute force
    doc = Document(line)
    cluster_uid = None
    sqdist_to_nearest = float('inf')
    for cluster_k in self.clusters:
        sqdist_k = MathUtil.compute_distance(map1=cluster_k.tfidf,
                                             map2=doc.tfidf,
                                             squared=True)
        if sqdist_k <= sqdist_to_nearest:
            # track the running minimum so the nearest cluster wins
            sqdist_to_nearest = sqdist_k
            cluster_uid = cluster_k.uid
    # dutifully emit.
    self.emit(key=cluster_uid, value=doc)
    return
def _get_arxiv_publications_as_documents(self, n, keyword="data"):
    # fetch n publications from arxiv.org
    url = ('http://export.arxiv.org/api/query?search_query=all:' + keyword +
           '&start=0&max_results=' + str(n))
    data = xmltodict.parse(urllib.request.urlopen(url).read())
    pubs = []
    # if only one document is requested, the response format is different
    if n == 1:
        data["feed"]["entry"] = [data["feed"]["entry"]]
    for pub in data["feed"]["entry"]:
        pubs.append(Document.factory("Arxiv", pub))
    return pubs
def Initialize():
    # Creating the trie object and passing it to both the functions
    T = Trie()
    # Initialising the stopwords into the trie
    T = StopWord.initialize(T)
    # Initialising the data by scraping and then loading it into the trie data structure
    T = Document.initializeData(T)
    # Fully loaded Trie data structure is returned
    return T
def get(self, domain, name):
    '''
    Retrieve a document

    :param domain: Name of the domain to get document from
    :param name: Name of the document to retrieve (within domain)
    :return: Document object
    '''
    doc_id = self.__engine.get_document_id(domain, name)
    if doc_id is None:
        raise KeyError("Document doesn't exist: %s %s" % (domain, name))
    return Document(self.__engine, doc_id)
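A self-contained sketch of the lookup contract used by get() above: the engine maps (domain, name) to a document id, and a missing id is reported to the caller as KeyError. The stub engine class and the data in it are hypothetical and exist only for illustration:

# Hypothetical stub that mimics the engine side of the get() contract.
class _StubEngine:
    def __init__(self, ids):
        self._ids = ids
    def get_document_id(self, domain, name):
        return self._ids.get((domain, name))

engine = _StubEngine({("invoices", "2021-03"): 42})
print(engine.get_document_id("invoices", "2021-03"))  # 42 -> get() wraps it in a Document
print(engine.get_document_id("invoices", "missing"))  # None -> get() raises KeyError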
def map(self, line):
    # TODO: call `self.emit(key, value)`
    instance = Document(line)
    min_dist = sys.maxsize
    key = -1
    for cluster in self.clusters:
        dist = MathUtil.compute_distance(map1=cluster.tfidf, map2=instance.tfidf)
        if dist < min_dist:
            key = cluster.uid
            min_dist = dist
    self.emit(key, line)  # instance.__str__()
def TermFrequency(word):
    TF = {}
    wordDic = {}
    docData = Document.docData
    k = Document.Search(word)
    for key, val in k.items():
        wordDic[key] = len(val)
        TF[key] = len(val) / docData[key]
        if len(val) > 0:
            Ranking.docDic = Ranking.docDic + 1
    return TF
def main():
    # read in training documents and documents to classify
    documentList = createDocuments(sys.argv[1])
    trainDocs = documentList[0]
    sampleDocs = documentList[1]
    # read in stopwords
    with open("stopwords.txt", "r") as f:
        stopwords = Document.tokenize(f.read())
    # classify the documents with missing authors
    attributor = Attributor(trainDocs, sampleDocs, stopwords)
    attributor.train()
    attributor.classify()
    writeup = Writeup()
    results = attributor.get_results()
    writeup.print_accuracy(sampleDocs, results)
    writeup.print_confusion_matrix(sampleDocs, results)
    print
    featureRankings = attributor.get_feature_ranking()
    for i in range(0, 20):
        print featureRankings[i][0], featureRankings[i][1]
    print
    featureFrequencies = attributor.get_feature_frequencies()
    featurePlotDataX = []
    featurePlotDataY = []
    for numFeatures in range(10, len(featureFrequencies) + 1, 10):
        newStopwords = [featureFrequencies[i][0] for i in range(0, numFeatures)]
        newAttributor = Attributor(trainDocs, sampleDocs, newStopwords)
        newAttributor.train()
        newAttributor.classify()
        newResults = newAttributor.get_results()
        accuracy = writeup.get_accuracy(sampleDocs, newResults)
        featurePlotDataX.append(numFeatures)
        featurePlotDataY.append(accuracy)
    print "Feature curve:"
    for i in range(len(featurePlotDataX)):
        print featurePlotDataX[i], featurePlotDataY[i]
    plt.plot(featurePlotDataX, featurePlotDataY)
    plt.xlabel("Number of Features")
    plt.ylabel("Accuracy")
    plt.title("Accuracy vs. Number of Features")
    plt.axis([0, 450, -0.1, 1.1])
    plt.show()
def save_doc():
    global reading_index
    if request.method == 'POST':
        # get time spent editting in seconds
        time_spent_editting = time.time() - edit_start_time
        if len(request.form['changes_dict']) == 0:
            changes = {}
        else:
            changes = eval(request.form['changes_dict'])
        button = request.form['button']
        print('changes', changes)
        # open document
        filetime = reading_times[reading_index]
        # open the already editted document if it exists
        to_save_to = editor_folder + os.sep + time_to_filename(filetime, extension='hocr')
        print('saving_to', to_save_to)
        if os.path.isfile(to_save_to):
            filepath = to_save_to
        else:
            filepath = (sess.dir_name + os.sep + source_dirs[source_dir_index] + os.sep +
                        time_to_filename(filetime, extension='hocr'))
        doc = Document(filepath, output_dir=editor_folder)
        if 'seconds_spent_editting' in doc.attrs:
            time_spent_editting += eval(doc.attrs['seconds_spent_editting'])
        doc.attrs['seconds_spent_editting'] = time_spent_editting
        # record the path of the file that was editted
        if not os.path.isfile(to_save_to):
            doc.attrs['editted_from_path'] = filepath
        # get the words with the same order and filters as the page
        img_path = (sess.dir_name + os.sep + settings.frame_images_dir + os.sep +
                    time_to_filename(filetime, extension='jpg'))
        img = mpimg.imread(img_path)
        all_words = word_list(doc, img.shape)
        # make changes
        for id_key in changes:
            index = int(id_key)
            all_words[index].text = changes[id_key][0]
            all_words[index].attrs['highlight'] = changes[id_key][1]
            all_words[index].attrs['editted_by_human'] = 'True'
        # save changes
        doc.save()
        # iterate index
        if button == 'Next':
            if reading_index + 1 < len(reading_times):
                reading_index += 1
                return redirect('/doc?reading_index={}'.format(reading_index))
            else:
                return redirect('/')
        else:
            if reading_index > 0:
                reading_index -= 1
                return redirect('/doc?reading_index={}'.format(reading_index))
            else:
                return redirect('/')
    # if no data was sent, go home
    return redirect('/')
def build(self):
    self.builder = gtk.Builder()
    self.builder.add_from_file(os.path.join(self.datadir, 'ui', 'snippets.ui'))
    handlers_dic = {
        'on_dialog_snippets_response': self.on_dialog_snippets_response,
        'on_dialog_snippets_destroy': self.on_dialog_snippets_destroy,
        'on_button_new_snippet_clicked': self.on_button_new_snippet_clicked,
        'on_button_import_snippets_clicked': self.on_button_import_snippets_clicked,
        'on_button_export_snippets_clicked': self.on_button_export_snippets_clicked,
        'on_button_remove_snippet_clicked': self.on_button_remove_snippet_clicked,
        'on_entry_tab_trigger_focus_out': self.on_entry_tab_trigger_focus_out,
        'on_entry_tab_trigger_changed': self.on_entry_tab_trigger_changed,
        'on_entry_accelerator_focus_out': self.on_entry_accelerator_focus_out,
        'on_entry_accelerator_focus_in': self.on_entry_accelerator_focus_in,
        'on_entry_accelerator_key_press': self.on_entry_accelerator_key_press,
        'on_source_view_snippet_focus_out': self.on_source_view_snippet_focus_out,
        'on_tree_view_snippets_row_expanded': self.on_tree_view_snippets_row_expanded,
        'on_tree_view_snippets_key_press': self.on_tree_view_snippets_key_press}
    self.builder.connect_signals(handlers_dic)
    self.build_tree_view()
    self.build_model()
    image = self['image_remove']
    image.set_from_stock(gtk.STOCK_REMOVE, gtk.ICON_SIZE_SMALL_TOOLBAR)
    source_view = self['source_view_snippet']
    manager = get_language_manager()
    lang = manager.get_language('snippets')
    if lang:
        source_view.get_buffer().set_highlight_syntax(True)
        source_view.get_buffer().set_language(lang)
        self.snippets_doc = Document(None, source_view)
    combo = self['combo_drop_targets']
    combo.set_text_column(0)
    entry = combo.child
    entry.connect('focus-out-event', self.on_entry_drop_targets_focus_out)
    entry.connect('drag-data-received', self.on_entry_drop_targets_drag_data_received)
    lst = entry.drag_dest_get_target_list()
    lst = gtk.target_list_add_uri_targets(entry.drag_dest_get_target_list(), self.TARGET_URI)
    entry.drag_dest_set_target_list(lst)
    self.dlg = self['dialog_snippets']
    if self.default_size:
        self.dlg.set_default_size(*self.default_size)
def __init__(self):
    '''
    On initialization, the class saves a reference to the DBlink class and two
    references to the Document class: one for the info of the last query and
    one for the aliases.
    '''
    self.dblink = DBlink()
    # the lastquery document must be in the personal path (HOME for Linux and macOS, Documents for Windows)
    self.lastquery = Document("lastquery.txt")
    # the alias document must be in the class' path
    path = os.path.dirname(os.path.abspath(__file__))
    self.alias = KL_Document("alias.txt", path)
    # default table for the medcordex user
    self.default_table = 'MEDCORDEX'
def __init__(self, vectorSize=100, windowSize=5):
    super()
    self.document = Document()
    if not Path(c.doc2VecModel).exists():
        docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(self.document.docList)]
        print(docs)
        self.model = Doc2Vec(vector_size=vectorSize, window=windowSize,
                             min_count=5, workers=4, epochs=40, alpha=0.025)
        self.model.build_vocab(docs)
        self.model.train(docs, total_examples=self.model.corpus_count,
                         epochs=self.model.epochs)
        self.model.save("./doc2VecModel")
    else:
        self.model = Doc2Vec.load(c.doc2VecModel)
def newDiagram(self, widget, data=None):
    newDocument = Document(self.tabsPanel.get_current_page() + 1)
    scrollArea = gtk.ScrolledWindow()
    scrollArea.set_policy(gtk.POLICY_ALWAYS, gtk.POLICY_ALWAYS)
    scrollArea.add_with_viewport(newDocument)
    n = self.tabsPanel.append_page(
        scrollArea,
        gtk.Label("Diagram %d" % (self.tabsPanel.get_n_pages() + 1)))
    scrollArea.show_all()
    self.tabsPanel.set_current_page(n)
    self.documentManager.documents.append(newDocument)
def get_neg_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    sentences, offset = doc.all_sentences()
    ranker = TextRank(sentences)
    ranker.rank()
    num = 5
    x = -1
    samples = ''
    sent_idx = []
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_idx.append(idx)
            samples += doc[idx].sentence.encode('utf-8') + '\n'
            num -= 1
    writeToFile(outfile, samples, 'w')
    #ranker = Ranker(sentences, tfidf=False)
    #return ranker, sent_idx
    #-----------------------------------------
    # Calculating the sectional TF-IDF
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------Negative---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx
def main():
    filename = input('Please input a filename: ')
    fileA = Document(filename)
    title = fileA.generateWhole()
    wordlist = fileA.wordlist
    o = 'The time required to do top 50 using dictionary: \n'
    worddict = BasicStats.createFreqMap(wordlist)
    n = 50
    a = time.time()
    topdict = BasicStats.topN(worddict, int(n))
    b = time.time()
    o += str(b - a) + '\n'
    o += 'The time required to do top 50 using heap: \n'
    c = time.time()
    k = BasicStats.HTopNBottomN(worddict, int(n))
    d = time.time()
    o += str(d - c) + '\n'
    o += '\nMax 50\n'
    for i in range(1, 51):
        o += str(k[1][i]) + ' ' + str(k[0][i]) + '\n'
    o += '\nMin 50\n'
    for i in range(1, 51):
        o += str(k[3][i]) + ' ' + str(k[2][i]) + '\n'
    lista = [[], []]
    for i in topdict:
        lista[0] += [i]           # words
        lista[1] += [topdict[i]]  # frequency
    graph = CommandLinePlotter.Scatter2D(lista[1])
    timefile = open('Top50TIMEFILE' + '-' + filename, 'wt', encoding='UTF-8')
    for j in o:
        timefile.write(j)
    timefile.close()
def __init__(self, dataDir, wordToIdMap, wordList):
    self.D = 0  # The number of documents
    # self.clusterNoArray = []
    self.documents = []
    with open(dataDir) as input:
        line = input.readline()
        while line:
            self.D += 1
            obj = json.loads(line)
            text = obj['textCleaned']
            document = Document(text, wordToIdMap, wordList, int(obj['tweetId']))
            self.documents.append(document)
            line = input.readline()
    print("number of documents is ", self.D)
def map(self, line):
    # Key is cluster id - clusters stored in self.clusters
    # Value is the line
    dist = float("inf")
    temp_dist = float("inf")
    doc = Document(line)
    key = doc.uid
    for c in self.clusters:
        temp_dist = MathUtil.compute_distance(doc.tfidf, c.tfidf)
        if temp_dist < dist:
            dist = temp_dist
            key = c.uid
    self.emit(str(key), str(doc))
def four_frames_test():
    # make directories
    original_pic_dir = 'tests/four-frames/original-pictures'
    dir_for_bigger_images = 'tests/four-frames' + os.sep + settings.images_ready_for_ocr
    if not os.path.isdir(dir_for_bigger_images):
        os.mkdir(dir_for_bigger_images)
    dir_for_hocr = 'tests/four-frames' + os.sep + settings.hocr_dir
    if not os.path.isdir(dir_for_hocr):
        os.mkdir(dir_for_hocr)
    dir_for_xml = 'tests/four-frames' + os.sep + settings.xml_dir
    if not os.path.isdir(dir_for_xml):
        os.mkdir(dir_for_xml)
    # make initial run through the images
    for filename in os.listdir(original_pic_dir):
        # resize
        full_path = original_pic_dir + os.sep + filename
        full_path_for_new_image = dir_for_bigger_images + os.sep + filename
        initial_ocr.resize_image(full_path, full_path_for_new_image, redo=True,
                                 part='digital reading')
        # run tesseract
        full_path_for_hocr = dir_for_hocr + os.sep + filename
        initial_ocr.run_tesseract_on_image(full_path_for_new_image,
                                           full_path_for_hocr, redo=True)
    # make corrections
    correct_bags = ocr_cleanup.get_correct_bags()
    word_to_doc = ocr_cleanup.make_matching_dictionary(correct_bags)
    ocr_cleanup.cleanup_hocr_files(dir_for_hocr, dir_for_xml, correct_bags, word_to_doc)
    # find differences
    for filename in os.listdir(dir_for_xml):
        full_path = dir_for_xml + os.sep + filename
        doc = Document(full_path)
        lines = [str(l).strip() for l in doc.lines if len(str(l).strip()) > 0]
        filename_with_txt_ending = filename[:-len('png.hocr')] + 'txt'
        path_to_correct_lines_file = ('tests/four-frames' + os.sep +
                                      'limited-correct-output-text' + os.sep +
                                      filename_with_txt_ending)
        with open(path_to_correct_lines_file, 'r') as infile:
            correct_lines = [line.strip() for line in infile]
        if len(lines) != len(correct_lines):
            raise Exception(
                'lines has length {0} but correct_lines has length {1} for {2}'
                .format(len(lines), len(correct_lines), filename))
        for i in range(len(lines)):
            if lines[i] != correct_lines[i]:
                raise Exception(
                    'lines[{0}] has value\n{1}\n but correct_lines[{0}] has value\n{2}\n for {3}'
                    .format(i, lines[i], correct_lines[i], filename))
    print('Four frames test passed')
def load_data(src_file, tgt_file):
    docs = []
    with open(src_file, 'r', encoding='utf-8') as src_reader, \
         open(tgt_file, 'r', encoding='utf-8') as tgt_reader:
        for src_line, tgt_line in zip(src_reader, tgt_reader):
            src_line = src_line.strip()
            tgt_line = tgt_line.strip()
            if src_line == "" or tgt_line == "":
                docs.append(None)
                continue
            src_sents = src_line.split('##SENT##')
            tgt_sents = tgt_line.strip().split('##SENT##')
            docs.append(Document(src_sents, tgt_sents))
    return docs
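This loader expects one document per line in each file, with sentences joined by the literal '##SENT##' marker; an empty line on either side yields a None placeholder so the source and target files stay aligned. The input text below is a hypothetical example, not taken from the original data, and only demonstrates the split that load_data() performs:

# Hypothetical input line; the split mirrors what load_data() does per line.
src_line = "Der Hund bellt .##SENT##Die Katze schlaeft ."
print(src_line.split('##SENT##'))  # ['Der Hund bellt .', 'Die Katze schlaeft .']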
def __init__(self):
    super()
    self.document = Document()
    self.nodes = self.document.nodeList
    self.docs = [item.lower().split() for item in self.document.rawDocList]
    if Path('word2vecTrained.model').exists():
        self.word2Vec = Word2Vec.load('word2vecTrained.model')
    else:
        self.word2Vec = Word2Vec(self.docs, size=100, window=4, min_count=1, workers=3)
        self.word2Vec.save('word2vecTrained.model')
def test_zip(self):
    self.d.append({"a": ["a"]})
    self.d.append({"a": ["aa"]})
    self.d.append({"a": ["aaa"]})
    d2 = Document()
    d2.append({"b": ["b"]})
    d2.append({"b": ["bb"]})
    joined = Document.zip(self.d, d2)
    self.assertEqual(list(joined), [{"a": ["a"], "b": ["b"]},
                                    {"a": ["aa"], "b": ["bb"]},
                                    {"a": ["aaa"]}])
def __init__(self, manualFilepath=config.manualPath, k=60):
    documents = Document(manualFilepath)
    docs = documents.docList
    self.Doc = documents
    self.docTree = documents.tree
    self.nodes = documents.nodeList
    self.uniqueTerms = getAllUniqueTerms(docs)
    self.allTopics = documents.allTopics
    termDocMatrix = getTermDocMatrix(self.uniqueTerms, docs)
    u, s, vh = np.linalg.svd(termDocMatrix, full_matrices=False)
    S = np.diag(s)
    # keep only the first k singular values/vectors (truncated SVD)
    self.uk = u[:, :k]
    self.Sk = S[:k, :k]
    self.vhk = vh[:k, :]
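The truncated factors above are the standard ingredients of LSA: documents live in the k-dimensional space spanned by the top singular vectors. A brief sketch of the usual next step, folding a new term-vector query into that space; this is textbook LSA, and how this particular codebase consumes uk/Sk is an assumption, as is the orientation of termDocMatrix (terms as rows):

# Hypothetical follow-up, not part of the original class.
import numpy as np

def fold_in_query(q, uk, Sk):
    # q: raw term-count vector in the same term order as the rows of termDocMatrix
    # returns the k-dimensional LSA representation q_k = q^T * U_k * S_k^{-1}
    return q @ uk @ np.linalg.inv(Sk)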
def load_data(path=""):
    time.sleep(0.5)
    print("start to load data from path----->", path)
    time.sleep(0.5)
    file_list = os.listdir(path)
    sentences = list()
    for i in range(len(file_list)):
        filename = file_list[i]
        current_path = os.path.join(path, filename)
        document = Document(filename=current_path)
        for sentence in document.sentence_list:
            sentences.append(sentence)
    return sentences
def map(self, line):
    # TODO: Your code goes here -- call `self.emit(key, value)`
    doc = Document(line)
    shortest = float('inf')
    # current cluster label of this data point
    cur_center = 999
    for cluster in self.clusters:
        dist_temp = self.l2_norm(doc.tfidf, cluster.tfidf)
        if dist_temp < shortest:
            shortest = dist_temp
            cur_center = cluster.uid
    self.emit(str(cur_center), str(doc))
def summarize_secitons(document, sections):
    logit(document)
    doc = Document(document)
    all_sentences, all_offset = doc.all_sentences()
    summ = []
    for section_name in sections:
        sec_sentences, sec_offset = doc.section_sentences(section_name)
        # Ranker
        ranker = TextRank(sec_sentences)
        ranker.rank()
        sentencs = ranker.scores
        summary = []
        for x in range(num):
            idx = sentencs[x][0] + sec_offset
            sent = doc[idx].sentence
            summary.append((sent, sentencs[x][1], doc.get_section_name(idx)))
            summ.append(sent)
        text = ''
        logit("\nSection : " + section_name)
        for sent, score, section in summary:
            text += '\n' + sent.encode('utf-8')
        logit(text)
    file = DIR['BASE'] + "data/Summary.txt"
    with open(file, 'w') as sfile:
        sfile.write('\n'.join(summ).encode('utf-8'))
    # Evaluator
    guess_summary_list = [file]
    ref_summary_list = [[DIR['BASE'] + "data/P10-1024-Ref1.txt"]]
    recall, precision, F_measure = PythonROUGE(guess_summary_list,
                                               ref_summary_list,
                                               ngram_order=1)
    logit("Recall:{0} ; Precision:{1} ; F:{2}".format(recall, precision, F_measure))
def predict(test_data):
    with open('predictions.csv', 'w', encoding='utf-8') as output:
        writer = csv.writer(output)
        writer.writerow(['document', 'predict_class', 'predict_score', 'exp_predict_score'])
        for instance in test_data.iterrows():
            doctext = instance[1]['document']
            doc = Document(doctext)
            predict_clas = max(stats.classes, key=lambda c: _compute_score(doc, c))
            predict_score = _compute_score(doc, predict_clas)
            exp_predict_score = np.exp(predict_score)
            writer.writerow([doctext, predict_clas, predict_score, exp_predict_score])
def search(self):
    logger = logging.getLogger("qa_logger")
    logger.info("%s:\tDocument Retrieval", self.id_q)
    search_engines = self._get_search_engines()
    try:
        num = int(MyConfig.get("document_retrieval", "n_results"))
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        num = 10
    results = []
    for engine in search_engines:
        try:
            results += engine.search(self.query, count=num)
        except Exception as e:
            logger = logging.getLogger("qa_logger")
            logger.error("Problem with search engine.")
            logger.debug(e)
            sys.exit(1)
    doc_list = []
    # rank loops over [0..num-1]
    rank = 0
    # ignore repeated urls
    unique_urls = set()
    for resource in results:
        if resource.url in unique_urls:
            continue
        unique_urls.add(resource.url)
        # rank+1 loops over [1..num]
        # rank+1 is the relative position of the results
        doc_list.append(Document(resource, rank + 1))
        rank = (rank + 1) % num
    try:
        if MyConfig.get("persistence", "document") == "True":
            output = open("documentos.pkl", "wb")
            pickle.dump(doc_list, output, 0)
            output.close()
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
    return doc_list
def __init__(self, docCSV):
    self.doc = Document()
    self.allHeadlines = self.doc.allTopics
    self.questions = []
    self.expectedPages = []
    self.expectedTopicHeadline = []
    with open(docCSV) as f:
        reader = csv.reader(f)
        # Skip Header
        next(reader)
        questions = []
        for row in reader:
            self.expectedTopicHeadline.append(row[0])
            self.expectedPages.append(list(map(int, row[1].split(","))))
            questions.append(list(map(str.strip, row[2:])))
        self.questions = questions
def setUp(self):
    self.test_metadata = Metadata()
    self.test_text = ("Here is some test text. Blah blah blah blah \n" +
                      "1234567890987654321 Yea Alabama Drown 'em Tide!\n")
    self.test_filename = "test_superdoc.txt"
    self.test_document = Document(self.test_metadata, self.test_text,
                                  self.test_filename)
    self.test_metadata_list = ([self.test_metadata, self.test_metadata,
                                self.test_metadata])
    self.test_superdoc_text = self.test_text * 3
    #print self.test_superdoc_text
    self.test_superdoc = SuperDocument(self.test_metadata_list,
                                       self.test_superdoc_text,
                                       self.test_filename)
    self.assertEqual(len(self.test_superdoc.component_metadata), 3)
def handleMetaTable(self, data):
    data.pop(0)  # skip the header row
    data.sort(key=itemgetter(0))
    x, y = 120, 100
    hInterval = 50
    vInterval = 0
    for row in data:
        if x > 1400:
            x = 120
            y = y + vInterval + 50
            vInterval = 0
        clsitem = Document.create_clsitem(row[1], (x, y))
        w, h = clsitem.get_size()
        x = x + w + hInterval
        if vInterval < h:
            vInterval = h
def setUpClass(cls):
    logger.debug('Parsing documents.')
    cls.parser = Parser()
    setup = CompareXml()
    for file in os.listdir('data'):
        try:
            tree = cls.parser.parse_file('data/' + file)
            root = tree.getroot()
            form = cls.parser.find_tag(root, 'formular')
            form_id = str(cls.parser.get_attribute(form).get('id'))
            contract_number = cls.parser.find_tag(root, 'v_vertragsnummer')
            setup.documents[file] = Document(form_id, contract_number, root)
        except etree.ParseError as e:
            logger.error('File ' + file + ' cannot be parsed.\n' + str(e))
    setup.check_preconditions()