def applyfilters(self, variables, filterscs):
    '''Apply the checked text filters to the checked files.

    variables -- list of Tk variables (one per file); value 1 means the file is checked.
    filterscs -- list of Tk variables (one per filter); value 1 means the filter is checked.

    Lazily builds Document objects for any files in GUI.fileName that do not
    yet have a matching entry in GUI.fileObj, applies every checked filter to
    every checked file, and updates the state string shown in the State frame.
    '''
    # No documents built yet: build one Document per registered file name.
    if GUI.fileObj == []:
        for thefile in GUI.fileName:
            GUI.fileObj.append(Document(thefile))
        for D in GUI.fileObj:
            D.generateWhole()
    # New files were added since the last build: only build the missing tail.
    elif len(GUI.fileObj) != len(GUI.fileName):
        lengthD = len(GUI.fileName) - len(GUI.fileObj)
        for i in range(lengthD)[::-1]:
            GUI.fileObj.append(Document(GUI.fileName[-(1 + i)]))
            GUI.fileObj[-1].generateWhole()
    empty = True
    for i in range(len(variables)):
        if variables[i].get() == 1:
            # Apply every checked filter to this checked file.
            dofil = TextFilter(GUI.fileObj[i])
            dofil.apply([
                self.filters[j] for j in range(len(filterscs))
                if filterscs[j].get() == 1
            ], GUI.fileObj[i])
            empty = False
    if empty == True:
        self.sti = 'Nothing applied'
    else:
        self.sti = 'Successfully applied'
    # Push the new state string into the Tk StringVar driving the State frame.
    self.sti_text.set(self.sti)
def read_newdata(self, dfile, wordmapfile):
    """Read a held-out document file, mapping tokens through the training wordmap.

    dfile       -- path to the new data file; first line is the document count M,
                   each following line is one whitespace-separated document.
    wordmapfile -- path to the word->id map produced during training.

    Builds two parallel corpora: `doc` holds training word ids, `_doc` holds a
    compact re-numbered id space for the new data (id2_id maps training id ->
    compact id; self._id2id holds the reverse). Words unseen in training are
    skipped. Returns 0 on success, 1 on any error.
    """
    word2id = {}
    id2_id = {}  # training word id -> compact new-data id
    self.read_wordmap1(wordmapfile, word2id)  # load word -> id from wordmapfile
    if len(word2id) <= 0:
        print("No word map available!\n")
        return 1
    # BUG FIX: open() never returns None; catch the failure instead.
    try:
        fin = open(dfile)
    except OSError:
        print("Cannot open file ", dfile, " to read!\n")
        return 1
    line = fin.readline()  # first line holds the number of new documents
    self.M = int(line)
    if self.M <= 0:
        print("No document available!\n")
        fin.close()  # avoid leaking the handle on early return
        return 1
    self.V = 0
    for i in range(self.M):
        line = fin.readline()
        strtok = Strtokenizer(line, ' \t\r\n')
        length = strtok.count_tokens()
        doc = []
        _doc = []
        for j in range(length):
            token = strtok.token(j)
            # BUG FIX: use dict membership instead of linear scans over
            # keys()/values(); the original also tested id2_id.values() and
            # assigned _id = word2id[token] — per the GibbsLDA++ reference,
            # membership is on the keys and _id comes from id2_id.
            wid = word2id.get(token)
            if wid is None:
                continue  # word not seen in training: drop it
            if wid not in id2_id:
                _id = len(id2_id)
                id2_id[wid] = _id
                self._id2id[_id] = wid
            else:
                _id = id2_id[wid]
            doc.append(wid)
            _doc.append(_id)
        pdoc = Document(doc)
        _pdoc = Document(_doc)
        self.add_doc(pdoc, i)
        self._add_doc(_pdoc, i)
    fin.close()
    self.V = len(id2_id)  # vocabulary size of the new data
    return 0
def read_file(self, file):
    # Read a single corpus file (CoNLL-like format) from an open file handle.
    # Lines are parsed into Word and Sentence objects grouped into Document
    # objects; '#begin' starts a document, any other '#' line ends it.
    # Returns the list of parsed Document objects.
    doc_to_return = Document()
    lines = file.readlines()
    pre_sent = []      # words accumulated for the sentence in progress
    sent_index = 0     # index of the sentence within the current document
    is_offset = 1      # only used by the commented-out ISNotes offset mapping
    doc_list = []
    for line in lines:
        if not line.startswith('#'):
            if line.isspace():
                # Blank line terminates the current sentence.
                doc_to_return.sentences.append(
                    Sentence(pre_sent.copy(), sent_index))
                pre_sent.clear()
                sent_index += 1
            else:
                entries = line.split()
                # Set Doc Id from the first column of the first token line.
                if not doc_to_return.docID:
                    doc_to_return.get_doc_id(entries[0])
                # Construct word from the token's columns.
                word = Word()
                word.set_entries(entries)
                pre_sent.append(word)
                # Create mapping for resolving difference in ISNotes offset and OntoNotes
                #doc_to_return.coref_to_ISoffset[is_offset] = (doc_to_return.header, sent_index, int(entries[2]))
                #is_offset += 1
        elif line.startswith('#begin'):
            # '#begin' marks the start of a new document.
            doc_to_return = Document()
            doc_to_return.header = line
        else:
            # Any other '#' line (e.g. '#end') closes the current document.
            doc_to_return.end = line
            doc_list.append(doc_to_return)
            sent_index = 0
            doc_to_return = Document()
    # for sent in doc_to_return.sentences:
    #     sent.get_names()
    # May no longer be needed. since exists in make_mentions in sentence_init
    # Construct gold coreference clusters
    # for doc in doc_list:
    #     for sent in doc.sentences:
    #         for m in sent.mentions:
    #             if m.gold_cluster_id is not None:
    #                 doc.gold_clusters[m.gold_cluster_id].append(m)
    return doc_list
def load_dataset():
    """Load ECHR case documents and wire up their cross-references.

    Reads the reference map and the case list from the module-level paths
    `references_path` and `cases_path`, builds one Document per case, resolves
    outgoing references between documents, then enriches the documents with
    articles/conclusions and PageRank scores. Returns the list of Documents.
    """
    documents = []
    with open(references_path) as reference_file:
        reference_dict = json.load(reference_file)
    with open(cases_path) as json_file:
        cases_json = json.load(json_file)
    for index, case in enumerate(cases_json):
        application_number = case['appno']
        document_name = case['docname']
        document_id = case['itemid']
        multiple_appnos = False
        # All keys in which the application number appears
        # (keys are ';'-separated lists of application numbers).
        matching_key = [
            key for key in reference_dict.keys()
            if application_number in key.split(';')
        ]
        multi_refs = []
        temp_internal_appnos = []
        for k in matching_key:
            multi_refs.extend(reference_dict[k].keys())
            split = k.split(';')
            temp_internal_appnos.extend(split)
        # All outgoing refs from the matching keys, deduplicated.
        multi_refs = remove_dups(multi_refs)
        # All application numbers contained in each matching key.
        temp_internal_appnos = remove_dups(temp_internal_appnos)
        document = Document.Document(application_id=application_number,
                                     document_id=document_id,
                                     title=document_name,
                                     references_appno=multi_refs,
                                     internal_appnos=temp_internal_appnos,
                                     multiple_appnos=multiple_appnos
                                     )
        # related_appnos=remaining_appnos
        documents.append(document)
        print(f'Loaded document: {index}')
    for count, doc in enumerate(documents):
        print(f'reffing doc {count}')
        # Check all refs (internal and outgoing): if an appno appears in
        # another document's internal appnos, record a doc -> doc reference.
        for outgoing in doc.all_refs:
            for other_doc in documents:
                if outgoing in other_doc.internal_appnos:
                    doc.outgoing_refs[other_doc.document_id] = 1
                    break
    documents = add_articles_and_conclusion_to_documents(documents)
    adjacency_matrix = create_adjacency_matrix_from_references(documents)
    assign_pagerank_to_documents(documents, adjacency_matrix)
    return documents
def finditer(self, doc):
    """
    Yield every run of consecutive words matched by self.matchers.

    doc can be a Document object or a plain string to convert into one.
    Each iteration yields a list of matched words, one per matcher.
    HINT: if you would rather have a (start,end) doc range, do
    (result[0].location[0], result[-1].location[-1])
    """
    # NOTE: the Py2-only `unicode` check was dropped; isinstance(str) covers Py3.
    if isinstance(doc, str):
        doc = Document.Document(doc)
    words = doc.words()
    n = len(self.matchers)
    # Last start index at which a full run of n matchers still fits.
    endi = len(words) - n + 1
    for start in range(endi):
        matched = []
        # BUG FIX: the original ran *every* matcher against each word of the
        # whole remaining window; matcher k must be aligned with word start+k.
        for offset, matcher in enumerate(self.matchers):
            word = words[start + offset]
            if not matcher.match(word):
                matched = None
                break
            matched.append(word)
        if matched is not None:
            yield matched
def add_dataset(self, folder_path):
    """Load every text in the dataset at folder_path as an analyzed Document."""
    dataset = Dataset(folder_path)
    for raw_text in dataset.get_contents():
        doc = Document(text=raw_text, features=self.features)
        doc.analysis = self.analyze_source(source=doc)
        self.documents.append(doc)
def buildDocumentCollectionRegex(self, chemin_doc):
    """Parse the file at chemin_doc into a {doc_id: Document} dict.

    Entries are delimited by ".I"; the id is the leading digits of each
    entry and the text is the content of the ".W" section (or "" if absent).
    """
    with open(chemin_doc) as handle:
        raw = handle.read()
    chunks = raw.split(".I")
    collection = {}
    for index in range(1, len(chunks)):
        chunk = chunks[index]
        entry = Document.Document()
        doc_id = re.search(r'(\d*|$)', chunk[1:]).group(0)
        entry.setId(doc_id)
        body = re.search(r'\.W(.*?)\.', chunk, re.DOTALL)
        if body is not None:
            entry.setTexte(body.group(1).replace("\n", ' '))
        else:
            entry.setTexte("")
        collection[doc_id] = entry
    return collection
def buildDocCollectionSimple2(self, chemin_doc):
    """Parse a CACM/CISI-style file into a {doc_id: Document} dict.

    Each document starts with a ".I <id>" line; the text is taken as the
    lines following the ".T" marker, up to the next line containing '.'.
    NOTE(review): if a ".I" block has no ".T" line, the inner scan runs past
    the list end (IndexError); the handle also leaks if parsing raises —
    confirm inputs are always well-formed.
    """
    file = open(chemin_doc, "r")
    lignes = file.readlines()
    length = len(lignes)
    collection = {}
    i = 0
    while i < length:
        key = ""
        value = ""
        if ".I" in lignes[i]:
            key = lignes[i][3:-1]  # document id: text after ".I ", minus newline
            doc = Document.Document()
            doc.setId(key)
            i += 1
            # Skip forward to the ".T" (title/text) marker line.
            while ".T" not in lignes[i]:
                i += 1
            i += 1
            # Accumulate text lines until the next section marker (a '.' line).
            while i < length and "." not in lignes[i]:
                value += lignes[i]
                i += 1
            doc.setTexte(value[:-1])  # drop the trailing newline
            collection[key] = doc
        else:
            i += 1
    file.close()
    return collection
def import_txt(self, txt_file):
    """Parse fastText-style '__label__X<TAB>sentence' lines into Documents.

    Lines labelled '__label__neu' are skipped. Tallies positive/negative
    counts in self.label_counts and per-word occurrence counts in self.values.
    """
    temp_txt = []
    for line in txt_file:
        if len(line.rstrip().split("\t")) > 1:
            label = line.rstrip().split("\t")[0]
            if label != '__label__neu':
                sentence = line.rstrip().split("\t")[1]
                doc = Document.Document(sentence, label)
                self.docs.append(doc)
                temp_txt.append(doc)
                label_num = 0
                if doc.label == '__label__pos':
                    label_num = 0  # positive label
                elif doc.label == '__label__neg':
                    label_num = 1  # negative label
                self.label_counts[label_num] += 1
                for word in doc.words:
                    if word != '':
                        # First sighting: register the word and a fresh counter.
                        if word not in self.words:
                            value = WordValue()
                            self.words.append(word)
                            self.values.append(value)
                        # NOTE(review): indexing self.values by int(word) assumes
                        # word tokens are numeric ids aligned with the insertion
                        # order of self.values — verify against Document.words.
                        self.values[int(word)].appear[label_num] += 1
    pass
def createDocument(self, namespaceURI, qualifiedName, doctype):
    """Create a new Document (DOM createDocument).

    When qualifiedName is non-empty, a root element is created in the given
    namespace and appended to the document before it is returned.
    """
    import Document
    new_doc = Document.Document(doctype)
    if qualifiedName:
        root = new_doc.createElementNS(namespaceURI, qualifiedName)
        new_doc.appendChild(root)
    return new_doc
def buildDocumentCollectionRegex(fichier):
    """Build an index from a document base contained in `fichier`.

    The whole file is read at once and regular expressions extract the tagged
    fields: entries are split on ".I", the id is the leading digit run, and
    the text is the ".T" section (or "" when absent).

    :type fichier: str
    :param fichier: path of the file holding the documents to index
    :return: dict mapping document ids to Document objects,
             e.g. {"id1": Document1, "id2": Document2, ...}
    """
    handle = open(fichier, 'r')
    raw = handle.read()
    entries = raw.split(".I")
    resultat = dict()
    for idx in range(1, len(entries)):
        entry = entries[idx]
        document = Document.Document()
        id_doc = re.search(r'(\d*|$)', entry[1:]).group(0)
        document.setID(int(id_doc))
        title = re.search(r'\.T(.*?)\.', entry, re.DOTALL)
        if title is not None:
            document.setTexte(title.group(1).replace('\n', ' '))
        else:
            document.setTexte("")
        resultat[id_doc] = document
    handle.close()
    return resultat
def readDocuments(self):
    """Build a Document and its word list for every registered file."""
    for index in range(self.numFiles):
        doc = Document(self.fileDictionary[index])
        doc.generateWhole()
        self.docDictionary[index] = doc
        self.wordListDictionary[index] = doc.getWordList()
    print("Read Documents.")
def getDocument(self, text):
    """Extract the .I/.T/.W/.B fields from `text` and build a Document.

    The document body is the title, text and date joined by newlines.
    Raises AttributeError if any of the four markers is missing.
    """
    fields = {}
    for marker in ("I", "T", "W", "B"):
        fields[marker] = re.search("\\." + marker + "\\s(.*)", text).group(1)
    body = fields["T"] + "\n" + fields["W"] + "\n" + fields["B"]
    return Doc.Document(fields["I"], body, others=dict())
def find_coreferences(input_file, output_dir):
    """Resolve coreferences among the tagged mentions of a document.

    Non-pronoun anaphors are linked to the nearest earlier antecedent by
    head-word match, word-overlap similarity, or acronym match; pronouns are
    linked within a 9-mention window by exact match or attribute agreement.
    Still-unmatched tags get synthetic 'A<n>' antecedents found by regex in
    the raw document content. The annotated document is saved at the end.
    """
    doc = Document.Document(input_file, output_dir)
    tags = doc.tags
    for anaphor_idx in range(len(tags)):
        # Finding string matches
        if tags[anaphor_idx].content.lower() not in references.pronouns:
            # Scan backwards over every earlier mention.
            for antecedent_idx in range(anaphor_idx - 1, -1, -1):
                if (head_word_match(tags[anaphor_idx].content,
                                    tags[antecedent_idx].content)
                        or overlap_similarity(
                            tags[anaphor_idx].content,
                            tags[antecedent_idx].content) >= .5
                        or tags[anaphor_idx].content.lower() in find_acronyms(
                            tags[antecedent_idx].content)
                        or tags[antecedent_idx].content.lower() in
                        find_acronyms(tags[anaphor_idx].content)):
                    tags[anaphor_idx].ref = tags[antecedent_idx].id
                    # Link back only if the antecedent is not already resolved.
                    if not tags[antecedent_idx].ref:
                        tags[antecedent_idx].ref = tags[anaphor_idx].id
                    break
        # Look for pronouns
        else:
            # Only look back up to 9 mentions for pronoun antecedents.
            for antecedent_idx in range(anaphor_idx - 1,
                                        max(-1, anaphor_idx - 10), -1):
                if tags[anaphor_idx].content.lower(
                ) == tags[antecedent_idx].content.lower():
                    tags[anaphor_idx].ref = tags[antecedent_idx].id
                    if not tags[antecedent_idx].ref:
                        tags[antecedent_idx].ref = tags[anaphor_idx].id
                elif pronoun_attribute_match(
                        tags[anaphor_idx].content.lower(),
                        tags[antecedent_idx].content.lower()):
                    tags[anaphor_idx].ref = tags[antecedent_idx].id
                    if not tags[antecedent_idx].ref:
                        tags[antecedent_idx].ref = tags[anaphor_idx].id
    # Invent antecedents for any tag that never found one: search the raw
    # content for the mention's head word and add a synthetic 'A<n>' tag.
    unmatched_tags = [t for t in doc.tags if not t.ref]
    counter = 0
    for tag in unmatched_tags:
        TAG_RE = re.compile('(?:^|[\s])(' +
                            re.escape(tag.content.lower().split()[-1]) +
                            ')(?:$|[\.\s\,\!\?])')
        matches = re.findall(TAG_RE, doc.content)
        if matches:
            for match in matches:
                new_id = 'A' + str(counter)
                doc.tags.append(Document.Tag(new_id, tag.id, match))
                tag.ref = new_id
                counter += 1
    doc.save()
def getAllDocuments():
    """Load every file under ./Docs as a Document and return them sorted."""
    paths = ["./Docs/" + name for name in os.listdir("./Docs")]
    return sorted(Document(path) for path in paths)
def getTechDocuments():
    """Load every file under ./Docs3/Technology as a Document and return them sorted."""
    paths = ["./Docs3/Technology/" + name
             for name in os.listdir("./Docs3/Technology")]
    return sorted(Document.Document(path, 0) for path in paths)
def run(self):
    """Summarize each document in self.documents_list into a <name>.txt file."""
    for path in self.documents_list:
        summary = Document(path).get_summary(5)
        # Derive the output name from the file's base name without extension.
        base = path.split('\\')[-1]
        base = base[:base.index(".")]
        with codecs.open(base + ".txt", "w", "utf-8-sig") as out:
            out.write(summary)
def docs(self, fileids=None):
    """
    Return a list of Document objects for all of the documents found in
    the files named by the fileids parameter.
    """
    documents = []
    for path in self.abspaths(fileids):
        dom = parse(path)
        documents.extend(
            Document(node) for node in dom.getElementsByTagName('Document'))
    return documents
def add_source(self, data, kind):
    """Wrap `data` as a Query or Document (per `kind`), analyze and store it."""
    if kind == SourceType.query:
        source = Query(text=data, features=self.features)
        source.analysis = self.analyze_source(source=source)
        self.queries.append(source)
    elif kind == SourceType.document:
        source = Document(text=data, features=self.features)
        source.analysis = self.analyze_source(source=source)
        source.document_id = len(self.documents)
        self.documents.append(source)
def rankFile(self, filename, measure='DRDC'):
    """Score a file rather than an entire RDG. NOT SUPPORTED!

    Returns [(term, score), ...] sorted by score, highest first.
    """
    ranking = []
    # NOTE(review): `overwrite` is a free variable here — unless a module-level
    # `overwrite` exists, this raises NameError; likely a missing parameter.
    d = Document(filename=filename, overwrite=overwrite)
    for w in d.counts:
        temp = self.metrics[measure](w)
        # Single-element loop kept from a disabled unstemming step.
        for s in [w]:  ## Filter.unstem(w):
            ranking.append((s, temp))
    ## # force ranks to be [0,1]
    ranking.sort(key=lambda x: x[1], reverse=True)
    return ranking
def getDocs(self):
    """Read files in the 'docfiles' directory and convert each to a Document.

    Each .txt file is split on the "(/)" marker into Title / Doctags / Pages /
    Pagetags sections; invalid files are skipped with a message. Parsed
    Documents (with their Pages, Tags and file timestamps) are appended to
    self.documents.
    """
    if not os.path.exists('docfiles'):
        os.makedirs('docfiles')
    for filename in os.listdir('docfiles'):
        if not filename.endswith('.txt'):
            continue
        path = 'docfiles' + os.sep + filename
        with open(path, "rt") as f:
            contents = f.read()
        doctags = []
        docContents = []
        pagetags = []
        convertedTagList = []
        pages = []
        section = contents.split("(/)")[1:]
        # BUG FIX: explicit checks instead of assert (stripped under -O);
        # the length check also avoids an uncaught IndexError on short files.
        if (len(section) < 4
                or not section[0].startswith("Title:")
                or not section[1].startswith("Doctags:")
                or not section[2].startswith("Pages:")
                or not section[3].startswith("Pagetags:")):
            print("Attempted to read invalid document.")
            continue
        title = section[0][len("Title:"):].strip()
        if not section[1][len("Doctags:"):].isspace():
            doctagstr = section[1][len("Doctags:"):].strip()
            for tag in doctagstr.split(","):
                doctags.append(Tag(tag.strip()))
        if not section[2][len("Pages:"):].isspace():
            pagestr = section[2][len("Pages:"):].strip()
            for elem in pagestr.split("<pwords>"):
                docContents.append(str(elem.strip()))
            pagetagstr = section[3][len("Pagetags:"):].strip()
            for elem in pagetagstr.split("<tname>"):
                if not elem.isspace():
                    # pagetags holds one comma-joined tag string per page
                    pagetags.append(elem.strip())
            for taglist in pagetags:
                convertedPageTags = []
                for tag in taglist.split(","):
                    if not tag.isspace() and not tag == "":
                        convertedPageTags.append(Tag(tag.strip()))
                convertedTagList.append(convertedPageTags)
        # BUG FIX: getctime/getmtime were referenced but never called, so the
        # Document was built with the repr of the functions, not timestamps.
        makeTime = os.path.getctime(path)
        editTime = os.path.getmtime(path)
        for i in range(len(docContents)):
            pages.append(Page(self, docContents[i], convertedTagList[i]))
        self.documents.append(
            Document(self, filename, title, f"{makeTime}", f"{editTime}",
                     doctags, pages))
def Refresh(self):
    """ This method allows us to update all data when this tab is displayed.
        This is necessary as the objects may have been changed since they
        were originally loaded. """
    try:
        # If a Library Object is defined, reload it
        if self.seriesObj != None:
            self.seriesObj = Library.Library(self.seriesObj.number)
        # If an Episode Object is defined, reload it
        if self.episodeObj != None:
            self.episodeObj = Episode.Episode(self.episodeObj.number)
        # If a Document Object is defined, reload it
        if self.documentObj != None:
            self.documentObj = Document.Document(self.documentObj.number)
        # If a Collection Object is defined, reload it
        if self.collectionObj != None:
            self.collectionObj = Collection.Collection(
                self.collectionObj.number)
        # If a Clip Object is defined, reload it.
        if self.clipObj != None:
            self.clipObj = Clip.Clip(self.clipObj.number)
        # If a Quote Object is defined, reload it.
        if self.quoteObj != None:
            self.quoteObj = Quote.Quote(self.quoteObj.number)
        # Get the local keyword list pointer aimed at the appropriate source object.
        # NOTE: If a Clip is defined use it (whether an episode is defined or not.) If
        # no clip is defined but an episode is defined, use that.
        if self.clipObj != None:
            self.kwlist = self.clipObj.keyword_list
        elif self.episodeObj != None:
            self.kwlist = self.episodeObj.keyword_list
        elif self.documentObj != None:
            self.kwlist = self.documentObj.keyword_list
        elif self.quoteObj != None:
            self.kwlist = self.quoteObj.keyword_list
        # Update the Tab Display
        self.UpdateKeywords()
    except TransanaExceptions.RecordNotFoundError:
        # A record was deleted out from under us: tell the user and bail out.
        msg = _(
            "The appropriate Keyword data could not be loaded from the database."
        )
        if not TransanaConstants.singleUserVersion:
            msg += '\n' + _(
                "This data may have been deleted by another user.")
        tmpDlg = Dialogs.ErrorDialog(self.parent, msg)
        tmpDlg.ShowModal()
        tmpDlg.Destroy()
        # Return to the database tab
        self.parent.parent.ControlObject.ShowDataTab(0)
def __init__(self):
    '''Build the text-filter interface on top of the base GUI frame.

    Constructs three LabelFrames inside the shared `root`: a file frame
    (checkable files), a filter frame (checkable text filters) and a state
    frame reporting whether filters have been applied. Also makes sure a
    Document object exists for every file name registered on the GUI class.
    '''
    super().__init__(root)
    self.forget()
    self.variables = []
    self.fileFrame = LabelFrame(root, text='Files')
    self.files()
    self.fileFrame.grid(row=2, column=0, columnspan=2,
                        sticky=W + E + N + S)
    self.filterframe = LabelFrame(root,
                                  text='Text Filters, Check to Apply')
    self.filters()
    self.filterframe.grid(row=2, column=2, columnspan=2, sticky=N)
    self.stateFrame = LabelFrame(root, text='State')
    self.state()
    self.stateFrame.grid(row=2, column=4, columnspan=1,
                         sticky=W + E + N + S)
    # No documents built yet: build one Document per registered file name.
    if GUI.fileObj == []:
        for thefile in GUI.fileName:
            GUI.fileObj.append(Document(thefile))
        for D in GUI.fileObj:
            D.generateWhole()
    # New files were added since the last build: only build the missing tail.
    elif len(GUI.fileObj) != len(GUI.fileName):
        lengthD = len(GUI.fileName) - len(GUI.fileObj)
        for i in range(lengthD)[::-1]:
            GUI.fileObj.append(Document(GUI.fileName[-(1 + i)]))
            GUI.fileObj[-1].generateWhole()
def calculate_MRR(document_directory_location, gold_label_directory_location,
                  stopwords, k, alpha, num_iterations, window):
    """Compute and print the Mean Reciprocal Rank over a document collection.

    For every file under document_directory_location that has a same-named
    gold-label file, builds a Document, runs preprocessing and word-graph
    construction (with the given window), and collects its reciprocal rank
    at cutoff k using the given alpha / iteration count.
    """
    Reciprocal_ranks = []
    for root, dirs, files in os.walk(document_directory_location):
        for file in files:
            # Only score documents that have a matching gold-label file.
            if p.exists(gold_label_directory_location + file):
                doc = Document(document_directory_location + file,
                               gold_label_directory_location + file,
                               stopwords)
                doc.Preprocess()
                doc.Create_word_graph(window)
                rec = doc.get_reciprocal_rank(k, alpha, num_iterations)
                #print(file,rec)
                Reciprocal_ranks.append(rec)
    # NOTE(review): raises ZeroDivisionError if no document/gold pair exists.
    mrr = sum(Reciprocal_ranks) / len(Reciprocal_ranks)
    print('Window =', window, "K =", k, 'MRR =', mrr)
def extractWithTemplate(self, template, xOffset=0, yOffset=0):
    """Recognize text in every region of `template` (shifted by the offsets).

    Returns a Document holding one DocumentBox per template box, or None
    when no template is given.
    """
    if not template:
        return None
    result = Document()
    for tbox in template.boxes:
        region = QRectF(tbox.rect)
        region.translate(xOffset, yOffset)
        raw = self.textInRegion(region, tbox.recognizer)
        cleaned = self.filter(raw, tbox.filter)
        dbox = DocumentBox()
        dbox.name = tbox.name
        dbox.text = cleaned
        dbox.templateBox = tbox
        result.addBox(dbox)
    return result
def normalize():
    """Load every *.persian_poem file in the working directory as a Document.

    Returns the list of Document objects, one per poem file, in glob order.
    """
    docs = []
    for name in glob.glob("*.persian_poem"):
        # BUG FIX: the original open(...).read() leaked the file handle.
        with open(name, "r") as handle:
            content = handle.read()
        docs.append(Document.Document(name, content))
    return docs
def fill_doc_list_train(file_path, amount=0):
    """Read tab-separated training lines from file_path into Document objects.

    Each line must hold at least three tab-separated fields, which become the
    Document's constructor arguments. `amount` caps the number of lines read;
    0 (the default) means read the whole file. Returns the Document list.
    """
    raw_lines = []
    with open(file_path, 'r', encoding='utf8') as infile:
        for line in infile:
            raw_lines.append(line)
            # BUG FIX: the limit was guarded by `if not amount:` so it could
            # never trigger; stop once `amount` lines have been collected.
            if amount and len(raw_lines) == amount:
                break
    documents_list = []
    for doc in raw_lines:
        fields = doc.split("\t")
        documents_list.append(Dc.Document(fields[0], fields[1], fields[2]))
    return documents_list  # list of Document class objects
def normalize(documents):
    """Load every *.persian_query file in the working directory as a query Document.

    Each query Document is built from (file name, file content, documents).
    Returns the list of query objects in glob order.
    """
    queries = []
    for name in glob.glob("*.persian_query"):
        # BUG FIX: the original open(...).read() leaked the file handle.
        with open(name, "r") as handle:
            content = handle.read()
        queries.append(Document.Document(name, content, documents))
    return queries
def import_docs(self, docs_file):
    """Parse 'label:sentence' lines into Documents, tallying pos/neg counts.

    Lines labelled 'neu' are skipped. Each parsed Document is appended to
    self.docs; the batch list is appended to self.docs_set. Word occurrence
    counters in self.values are bumped per label.
    """
    temp_docs = []
    for line in docs_file:
        parts = line.rstrip().split(":")
        if len(parts) > 1:
            label = parts[0]
            if label != 'neu':
                sentence = parts[1]
                doc = Document.Document(sentence, label)
                self.docs.append(doc)
                temp_docs.append(doc)
                # BUG FIX: label_num was unbound for labels other than
                # 'pos'/'neg' (UnboundLocalError on first such line);
                # default to 0, matching the sibling import_txt method.
                label_num = 0
                if doc.label == 'pos':
                    label_num = 0  # positive label
                elif doc.label == 'neg':
                    label_num = 1  # negative label
                self.label_counts[label_num] += 1
                for word in doc.words:
                    if word != '':
                        self.values[int(word)].appear[label_num] += 1
    self.docs_set.append(temp_docs)
def findMatchingTemplateByOffset(self, templates, offset=5):
    # Python 2 code (print statements).
    # Find the template whose 'matcher' boxes best match the scanned page,
    # trying every (x, y) shift in a fixed -5..+5 window, and return a dict
    # with the best template, its extracted document, and the winning offsets.
    # NOTE(review): the `offset` parameter is never used — the ranges below
    # hard-code 5; also `max` shadows the builtin. Confirm before changing.
    max = 0
    best = {
        'template': None,
        'document': Document(),
        'xOffset': 0,
        'yOffset': 0
    }
    for template in templates:
        if not template.boxes:
            continue
        # Consider up to 5 millimeter offset
        for xOffset in range(-5, 6):
            for yOffset in range(-5, 6):
                score = 0
                matcherBoxes = 0
                currentDocument = self.extractWithTemplate(
                    template, xOffset, yOffset)
                for documentBox in currentDocument.boxes:
                    templateBox = documentBox.templateBox
                    # Only boxes of type 'matcher' contribute to the score.
                    if documentBox.templateBox.type != 'matcher':
                        print "Jumping %s due to type %s" % (
                            templateBox.name, templateBox.type)
                        continue
                    matcherBoxes += 1
                    similarity = Trigram.trigram(documentBox.text,
                                                 templateBox.text)
                    score += similarity
                # Average the similarity over the matcher boxes.
                if matcherBoxes:
                    score = score / matcherBoxes
                if score > max or not matcherBoxes:
                    max = score
                    best = {
                        'template': template,
                        'document': currentDocument,
                        'xOffset': xOffset,
                        'yOffset': yOffset
                    }
                print "Template %s has score %s with offset (%s,%s)" % (
                    template.name, score, xOffset, yOffset)
    return best