def applyfilters(self, variables, filterscs):
    '''Apply the checked text filters to the checked files.

    variables -- list of Tk variables (one per file); value 1 means the file is checked.
    filterscs -- list of Tk variables (one per filter); value 1 means the filter is checked.

    Lazily builds Document objects for any files in GUI.fileName that do not
    yet have a matching entry in GUI.fileObj, applies every checked filter to
    every checked file, and updates the state string shown in the State frame.
    '''
    # No documents built yet: build one Document per registered file name.
    if GUI.fileObj == []:
        for thefile in GUI.fileName:
            GUI.fileObj.append(Document(thefile))
        for D in GUI.fileObj:
            D.generateWhole()
    # New files were added since the last build: only build the missing tail.
    elif len(GUI.fileObj) != len(GUI.fileName):
        lengthD = len(GUI.fileName) - len(GUI.fileObj)
        for i in range(lengthD)[::-1]:
            GUI.fileObj.append(Document(GUI.fileName[-(1 + i)]))
            GUI.fileObj[-1].generateWhole()
    empty = True
    for i in range(len(variables)):
        if variables[i].get() == 1:
            # Apply every checked filter to this checked file.
            dofil = TextFilter(GUI.fileObj[i])
            dofil.apply([
                self.filters[j] for j in range(len(filterscs))
                if filterscs[j].get() == 1
            ], GUI.fileObj[i])
            empty = False
    if empty == True:
        self.sti = 'Nothing applied'
    else:
        self.sti = 'Successfully applied'
    # Push the new state string into the Tk StringVar driving the State frame.
    self.sti_text.set(self.sti)
def read_newdata(self, dfile, wordmapfile):
    """Read a held-out document file, mapping tokens through the training wordmap.

    dfile       -- path to the new data file; first line is the document count M,
                   each following line is one whitespace-separated document.
    wordmapfile -- path to the word->id map produced during training.

    Builds two parallel corpora: `doc` holds training word ids, `_doc` holds a
    compact re-numbered id space for the new data (id2_id maps training id ->
    compact id; self._id2id holds the reverse). Words unseen in training are
    skipped. Returns 0 on success, 1 on any error.
    """
    word2id = {}
    id2_id = {}  # training word id -> compact new-data id
    self.read_wordmap1(wordmapfile, word2id)  # load word -> id from wordmapfile
    if len(word2id) <= 0:
        print("No word map available!\n")
        return 1
    # BUG FIX: open() never returns None; catch the failure instead.
    try:
        fin = open(dfile)
    except OSError:
        print("Cannot open file ", dfile, " to read!\n")
        return 1
    line = fin.readline()  # first line holds the number of new documents
    self.M = int(line)
    if self.M <= 0:
        print("No document available!\n")
        fin.close()  # avoid leaking the handle on early return
        return 1
    self.V = 0
    for i in range(self.M):
        line = fin.readline()
        strtok = Strtokenizer(line, ' \t\r\n')
        length = strtok.count_tokens()
        doc = []
        _doc = []
        for j in range(length):
            token = strtok.token(j)
            # BUG FIX: use dict membership instead of linear scans over
            # keys()/values(); the original also tested id2_id.values() and
            # assigned _id = word2id[token] — per the GibbsLDA++ reference,
            # membership is on the keys and _id comes from id2_id.
            wid = word2id.get(token)
            if wid is None:
                continue  # word not seen in training: drop it
            if wid not in id2_id:
                _id = len(id2_id)
                id2_id[wid] = _id
                self._id2id[_id] = wid
            else:
                _id = id2_id[wid]
            doc.append(wid)
            _doc.append(_id)
        pdoc = Document(doc)
        _pdoc = Document(_doc)
        self.add_doc(pdoc, i)
        self._add_doc(_pdoc, i)
    fin.close()
    self.V = len(id2_id)  # vocabulary size of the new data
    return 0
def read_file(self, file):
    # Read a single corpus file (CoNLL-like format) from an open file handle.
    # Lines are parsed into Word and Sentence objects grouped into Document
    # objects; '#begin' starts a document, any other '#' line ends it.
    # Returns the list of parsed Document objects.
    doc_to_return = Document()
    lines = file.readlines()
    pre_sent = []      # words accumulated for the sentence in progress
    sent_index = 0     # index of the sentence within the current document
    is_offset = 1      # only used by the commented-out ISNotes offset mapping
    doc_list = []
    for line in lines:
        if not line.startswith('#'):
            if line.isspace():
                # Blank line terminates the current sentence.
                doc_to_return.sentences.append(
                    Sentence(pre_sent.copy(), sent_index))
                pre_sent.clear()
                sent_index += 1
            else:
                entries = line.split()
                # Set Doc Id from the first column of the first token line.
                if not doc_to_return.docID:
                    doc_to_return.get_doc_id(entries[0])
                # Construct word from the token's columns.
                word = Word()
                word.set_entries(entries)
                pre_sent.append(word)
                # Create mapping for resolving difference in ISNotes offset and OntoNotes
                #doc_to_return.coref_to_ISoffset[is_offset] = (doc_to_return.header, sent_index, int(entries[2]))
                #is_offset += 1
        elif line.startswith('#begin'):
            # '#begin' marks the start of a new document.
            doc_to_return = Document()
            doc_to_return.header = line
        else:
            # Any other '#' line (e.g. '#end') closes the current document.
            doc_to_return.end = line
            doc_list.append(doc_to_return)
            sent_index = 0
            doc_to_return = Document()
    # for sent in doc_to_return.sentences:
    #     sent.get_names()
    # May no longer be needed. since exists in make_mentions in sentence_init
    # Construct gold coreference clusters
    # for doc in doc_list:
    #     for sent in doc.sentences:
    #         for m in sent.mentions:
    #             if m.gold_cluster_id is not None:
    #                 doc.gold_clusters[m.gold_cluster_id].append(m)
    return doc_list
def load_dataset():
    """Load ECHR case documents and wire up their cross-references.

    Reads the reference map and the case list from the module-level paths
    `references_path` and `cases_path`, builds one Document per case, resolves
    outgoing references between documents, then enriches the documents with
    articles/conclusions and PageRank scores. Returns the list of Documents.
    """
    documents = []
    with open(references_path) as reference_file:
        reference_dict = json.load(reference_file)
    with open(cases_path) as json_file:
        cases_json = json.load(json_file)
    for index, case in enumerate(cases_json):
        application_number = case['appno']
        document_name = case['docname']
        document_id = case['itemid']
        multiple_appnos = False
        # All keys in which the application number appears
        # (keys are ';'-separated lists of application numbers).
        matching_key = [
            key for key in reference_dict.keys()
            if application_number in key.split(';')
        ]
        multi_refs = []
        temp_internal_appnos = []
        for k in matching_key:
            multi_refs.extend(reference_dict[k].keys())
            split = k.split(';')
            temp_internal_appnos.extend(split)
        # All outgoing refs from the matching keys, deduplicated.
        multi_refs = remove_dups(multi_refs)
        # All application numbers contained in each matching key.
        temp_internal_appnos = remove_dups(temp_internal_appnos)
        document = Document.Document(application_id=application_number,
                                     document_id=document_id,
                                     title=document_name,
                                     references_appno=multi_refs,
                                     internal_appnos=temp_internal_appnos,
                                     multiple_appnos=multiple_appnos
                                     )
        # related_appnos=remaining_appnos
        documents.append(document)
        print(f'Loaded document: {index}')
    for count, doc in enumerate(documents):
        print(f'reffing doc {count}')
        # Check all refs (internal and outgoing): if an appno appears in
        # another document's internal appnos, record a doc -> doc reference.
        for outgoing in doc.all_refs:
            for other_doc in documents:
                if outgoing in other_doc.internal_appnos:
                    doc.outgoing_refs[other_doc.document_id] = 1
                    break
    documents = add_articles_and_conclusion_to_documents(documents)
    adjacency_matrix = create_adjacency_matrix_from_references(documents)
    assign_pagerank_to_documents(documents, adjacency_matrix)
    return documents
def finditer(self, doc):
    """
    Yield every run of consecutive words matched by self.matchers.

    doc can be a Document object or a plain string to convert into one.
    Each iteration yields a list of matched words, one per matcher.
    HINT: if you would rather have a (start,end) doc range, do
    (result[0].location[0], result[-1].location[-1])
    """
    # NOTE: the Py2-only `unicode` check was dropped; isinstance(str) covers Py3.
    if isinstance(doc, str):
        doc = Document.Document(doc)
    words = doc.words()
    n = len(self.matchers)
    # Last start index at which a full run of n matchers still fits.
    endi = len(words) - n + 1
    for start in range(endi):
        matched = []
        # BUG FIX: the original ran *every* matcher against each word of the
        # whole remaining window; matcher k must be aligned with word start+k.
        for offset, matcher in enumerate(self.matchers):
            word = words[start + offset]
            if not matcher.match(word):
                matched = None
                break
            matched.append(word)
        if matched is not None:
            yield matched
def add_dataset(self, folder_path):
    """Load every text in the dataset at folder_path as an analyzed Document."""
    dataset = Dataset(folder_path)
    for raw_text in dataset.get_contents():
        doc = Document(text=raw_text, features=self.features)
        doc.analysis = self.analyze_source(source=doc)
        self.documents.append(doc)
def buildDocumentCollectionRegex(self, chemin_doc):
    """Parse the file at chemin_doc into a {doc_id: Document} dict.

    Entries are delimited by ".I"; the id is the leading digits of each
    entry and the text is the content of the ".W" section (or "" if absent).
    """
    with open(chemin_doc) as handle:
        raw = handle.read()
    chunks = raw.split(".I")
    collection = {}
    for index in range(1, len(chunks)):
        chunk = chunks[index]
        entry = Document.Document()
        doc_id = re.search(r'(\d*|$)', chunk[1:]).group(0)
        entry.setId(doc_id)
        body = re.search(r'\.W(.*?)\.', chunk, re.DOTALL)
        if body is not None:
            entry.setTexte(body.group(1).replace("\n", ' '))
        else:
            entry.setTexte("")
        collection[doc_id] = entry
    return collection
def buildDocCollectionSimple2(self, chemin_doc):
    """Parse a CACM/CISI-style file into a {doc_id: Document} dict.

    Each document starts with a ".I <id>" line; the text is taken as the
    lines following the ".T" marker, up to the next line containing '.'.
    NOTE(review): if a ".I" block has no ".T" line, the inner scan runs past
    the list end (IndexError); the handle also leaks if parsing raises —
    confirm inputs are always well-formed.
    """
    file = open(chemin_doc, "r")
    lignes = file.readlines()
    length = len(lignes)
    collection = {}
    i = 0
    while i < length:
        key = ""
        value = ""
        if ".I" in lignes[i]:
            key = lignes[i][3:-1]  # document id: text after ".I ", minus newline
            doc = Document.Document()
            doc.setId(key)
            i += 1
            # Skip forward to the ".T" (title/text) marker line.
            while ".T" not in lignes[i]:
                i += 1
            i += 1
            # Accumulate text lines until the next section marker (a '.' line).
            while i < length and "." not in lignes[i]:
                value += lignes[i]
                i += 1
            doc.setTexte(value[:-1])  # drop the trailing newline
            collection[key] = doc
        else:
            i += 1
    file.close()
    return collection
def import_txt(self, txt_file):
    """Parse fastText-style '__label__X<TAB>sentence' lines into Documents.

    Lines labelled '__label__neu' are skipped. Tallies positive/negative
    counts in self.label_counts and per-word occurrence counts in self.values.
    """
    temp_txt = []
    for line in txt_file:
        if len(line.rstrip().split("\t")) > 1:
            label = line.rstrip().split("\t")[0]
            if label != '__label__neu':
                sentence = line.rstrip().split("\t")[1]
                doc = Document.Document(sentence, label)
                self.docs.append(doc)
                temp_txt.append(doc)
                label_num = 0
                if doc.label == '__label__pos':
                    label_num = 0  # positive label
                elif doc.label == '__label__neg':
                    label_num = 1  # negative label
                self.label_counts[label_num] += 1
                for word in doc.words:
                    if word != '':
                        # First sighting: register the word and a fresh counter.
                        if word not in self.words:
                            value = WordValue()
                            self.words.append(word)
                            self.values.append(value)
                        # NOTE(review): indexing self.values by int(word) assumes
                        # word tokens are numeric ids aligned with the insertion
                        # order of self.values — verify against Document.words.
                        self.values[int(word)].appear[label_num] += 1
    pass
def createDocument(self, namespaceURI, qualifiedName, doctype):
    """Create a new Document (DOM createDocument).

    When qualifiedName is non-empty, a root element is created in the given
    namespace and appended to the document before it is returned.
    """
    import Document
    new_doc = Document.Document(doctype)
    if qualifiedName:
        root = new_doc.createElementNS(namespaceURI, qualifiedName)
        new_doc.appendChild(root)
    return new_doc
def buildDocumentCollectionRegex(fichier):
    """Build an index from a document base contained in `fichier`.

    The whole file is read at once and regular expressions extract the tagged
    fields: entries are split on ".I", the id is the leading digit run, and
    the text is the ".T" section (or "" when absent).

    :type fichier: str
    :param fichier: path of the file holding the documents to index
    :return: dict mapping document ids to Document objects,
             e.g. {"id1": Document1, "id2": Document2, ...}
    """
    handle = open(fichier, 'r')
    raw = handle.read()
    entries = raw.split(".I")
    resultat = dict()
    for idx in range(1, len(entries)):
        entry = entries[idx]
        document = Document.Document()
        id_doc = re.search(r'(\d*|$)', entry[1:]).group(0)
        document.setID(int(id_doc))
        title = re.search(r'\.T(.*?)\.', entry, re.DOTALL)
        if title is not None:
            document.setTexte(title.group(1).replace('\n', ' '))
        else:
            document.setTexte("")
        resultat[id_doc] = document
    handle.close()
    return resultat
def readDocuments(self):
    """Build a Document and its word list for every registered file."""
    for index in range(self.numFiles):
        doc = Document(self.fileDictionary[index])
        doc.generateWhole()
        self.docDictionary[index] = doc
        self.wordListDictionary[index] = doc.getWordList()
    print("Read Documents.")
def getDocument(self, text):
    """Extract the .I/.T/.W/.B fields from `text` and build a Document.

    The document body is the title, text and date joined by newlines.
    Raises AttributeError if any of the four markers is missing.
    """
    fields = {}
    for marker in ("I", "T", "W", "B"):
        fields[marker] = re.search("\\." + marker + "\\s(.*)", text).group(1)
    body = fields["T"] + "\n" + fields["W"] + "\n" + fields["B"]
    return Doc.Document(fields["I"], body, others=dict())
def find_coreferences(input_file, output_dir):
    """Resolve coreferences among the tagged mentions of a document.

    Non-pronoun anaphors are linked to the nearest earlier antecedent by
    head-word match, word-overlap similarity, or acronym match; pronouns are
    linked within a 9-mention window by exact match or attribute agreement.
    Still-unmatched tags get synthetic 'A<n>' antecedents found by regex in
    the raw document content. The annotated document is saved at the end.
    """
    doc = Document.Document(input_file, output_dir)
    tags = doc.tags
    for anaphor_idx in range(len(tags)):
        # Finding string matches
        if tags[anaphor_idx].content.lower() not in references.pronouns:
            # Scan backwards over every earlier mention.
            for antecedent_idx in range(anaphor_idx - 1, -1, -1):
                if (head_word_match(tags[anaphor_idx].content,
                                    tags[antecedent_idx].content)
                        or overlap_similarity(
                            tags[anaphor_idx].content,
                            tags[antecedent_idx].content) >= .5
                        or tags[anaphor_idx].content.lower() in find_acronyms(
                            tags[antecedent_idx].content)
                        or tags[antecedent_idx].content.lower() in
                        find_acronyms(tags[anaphor_idx].content)):
                    tags[anaphor_idx].ref = tags[antecedent_idx].id
                    # Link back only if the antecedent is not already resolved.
                    if not tags[antecedent_idx].ref:
                        tags[antecedent_idx].ref = tags[anaphor_idx].id
                    break
        # Look for pronouns
        else:
            # Only look back up to 9 mentions for pronoun antecedents.
            for antecedent_idx in range(anaphor_idx - 1,
                                        max(-1, anaphor_idx - 10), -1):
                if tags[anaphor_idx].content.lower(
                ) == tags[antecedent_idx].content.lower():
                    tags[anaphor_idx].ref = tags[antecedent_idx].id
                    if not tags[antecedent_idx].ref:
                        tags[antecedent_idx].ref = tags[anaphor_idx].id
                elif pronoun_attribute_match(
                        tags[anaphor_idx].content.lower(),
                        tags[antecedent_idx].content.lower()):
                    tags[anaphor_idx].ref = tags[antecedent_idx].id
                    if not tags[antecedent_idx].ref:
                        tags[antecedent_idx].ref = tags[anaphor_idx].id
    # Invent antecedents for any tag that never found one: search the raw
    # content for the mention's head word and add a synthetic 'A<n>' tag.
    unmatched_tags = [t for t in doc.tags if not t.ref]
    counter = 0
    for tag in unmatched_tags:
        TAG_RE = re.compile('(?:^|[\s])(' +
                            re.escape(tag.content.lower().split()[-1]) +
                            ')(?:$|[\.\s\,\!\?])')
        matches = re.findall(TAG_RE, doc.content)
        if matches:
            for match in matches:
                new_id = 'A' + str(counter)
                doc.tags.append(Document.Tag(new_id, tag.id, match))
                tag.ref = new_id
                counter += 1
    doc.save()
def getAllDocuments():
    """Load every file under ./Docs as a Document and return them sorted."""
    paths = ["./Docs/" + name for name in os.listdir("./Docs")]
    return sorted(Document(path) for path in paths)
def getTechDocuments():
    """Load every file under ./Docs3/Technology as a Document and return them sorted."""
    paths = ["./Docs3/Technology/" + name
             for name in os.listdir("./Docs3/Technology")]
    return sorted(Document.Document(path, 0) for path in paths)
def run(self):
    """Summarize each document in self.documents_list into a <name>.txt file."""
    for path in self.documents_list:
        summary = Document(path).get_summary(5)
        # Derive the output name from the file's base name without extension.
        base = path.split('\\')[-1]
        base = base[:base.index(".")]
        with codecs.open(base + ".txt", "w", "utf-8-sig") as out:
            out.write(summary)
def docs(self, fileids=None):
    """
    Return a list of Document objects for all of the documents found in
    the files named by the fileids parameter.
    """
    documents = []
    for path in self.abspaths(fileids):
        dom = parse(path)
        documents.extend(
            Document(node) for node in dom.getElementsByTagName('Document'))
    return documents
def add_source(self, data, kind):
    """Wrap `data` as a Query or Document (per `kind`), analyze and store it."""
    if kind == SourceType.query:
        source = Query(text=data, features=self.features)
        source.analysis = self.analyze_source(source=source)
        self.queries.append(source)
    elif kind == SourceType.document:
        source = Document(text=data, features=self.features)
        source.analysis = self.analyze_source(source=source)
        source.document_id = len(self.documents)
        self.documents.append(source)
def rankFile(self, filename, measure='DRDC'):
    """Score a file rather than an entire RDG. NOT SUPPORTED!

    Returns [(term, score), ...] sorted by score, highest first.
    """
    ranking = []
    # NOTE(review): `overwrite` is a free variable here — unless a module-level
    # `overwrite` exists, this raises NameError; likely a missing parameter.
    d = Document(filename=filename, overwrite=overwrite)
    for w in d.counts:
        temp = self.metrics[measure](w)
        # Single-element loop kept from a disabled unstemming step.
        for s in [w]:  ## Filter.unstem(w):
            ranking.append((s, temp))
    ## # force ranks to be [0,1]
    ranking.sort(key=lambda x: x[1], reverse=True)
    return ranking
def getDocs(self):
    """Read files in the 'docfiles' directory and convert each to a Document.

    Each .txt file is split on the "(/)" marker into Title / Doctags / Pages /
    Pagetags sections; invalid files are skipped with a message. Parsed
    Documents (with their Pages, Tags and file timestamps) are appended to
    self.documents.
    """
    if not os.path.exists('docfiles'):
        os.makedirs('docfiles')
    for filename in os.listdir('docfiles'):
        if not filename.endswith('.txt'):
            continue
        path = 'docfiles' + os.sep + filename
        with open(path, "rt") as f:
            contents = f.read()
        doctags = []
        docContents = []
        pagetags = []
        convertedTagList = []
        pages = []
        section = contents.split("(/)")[1:]
        # BUG FIX: explicit checks instead of assert (stripped under -O);
        # the length check also avoids an uncaught IndexError on short files.
        if (len(section) < 4
                or not section[0].startswith("Title:")
                or not section[1].startswith("Doctags:")
                or not section[2].startswith("Pages:")
                or not section[3].startswith("Pagetags:")):
            print("Attempted to read invalid document.")
            continue
        title = section[0][len("Title:"):].strip()
        if not section[1][len("Doctags:"):].isspace():
            doctagstr = section[1][len("Doctags:"):].strip()
            for tag in doctagstr.split(","):
                doctags.append(Tag(tag.strip()))
        if not section[2][len("Pages:"):].isspace():
            pagestr = section[2][len("Pages:"):].strip()
            for elem in pagestr.split("<pwords>"):
                docContents.append(str(elem.strip()))
            pagetagstr = section[3][len("Pagetags:"):].strip()
            for elem in pagetagstr.split("<tname>"):
                if not elem.isspace():
                    # pagetags holds one comma-joined tag string per page
                    pagetags.append(elem.strip())
            for taglist in pagetags:
                convertedPageTags = []
                for tag in taglist.split(","):
                    if not tag.isspace() and not tag == "":
                        convertedPageTags.append(Tag(tag.strip()))
                convertedTagList.append(convertedPageTags)
        # BUG FIX: getctime/getmtime were referenced but never called, so the
        # Document was built with the repr of the functions, not timestamps.
        makeTime = os.path.getctime(path)
        editTime = os.path.getmtime(path)
        for i in range(len(docContents)):
            pages.append(Page(self, docContents[i], convertedTagList[i]))
        self.documents.append(
            Document(self, filename, title, f"{makeTime}", f"{editTime}",
                     doctags, pages))
def Refresh(self):
    """ This method allows us to update all data when this tab is displayed.
        This is necessary as the objects may have been changed since they
        were originally loaded. """
    try:
        # If a Library Object is defined, reload it
        if self.seriesObj != None:
            self.seriesObj = Library.Library(self.seriesObj.number)
        # If an Episode Object is defined, reload it
        if self.episodeObj != None:
            self.episodeObj = Episode.Episode(self.episodeObj.number)
        # If a Document Object is defined, reload it
        if self.documentObj != None:
            self.documentObj = Document.Document(self.documentObj.number)
        # If a Collection Object is defined, reload it
        if self.collectionObj != None:
            self.collectionObj = Collection.Collection(
                self.collectionObj.number)
        # If a Clip Object is defined, reload it.
        if self.clipObj != None:
            self.clipObj = Clip.Clip(self.clipObj.number)
        # If a Quote Object is defined, reload it.
        if self.quoteObj != None:
            self.quoteObj = Quote.Quote(self.quoteObj.number)
        # Get the local keyword list pointer aimed at the appropriate source object.
        # NOTE: If a Clip is defined use it (whether an episode is defined or not.) If
        # no clip is defined but an episode is defined, use that.
        if self.clipObj != None:
            self.kwlist = self.clipObj.keyword_list
        elif self.episodeObj != None:
            self.kwlist = self.episodeObj.keyword_list
        elif self.documentObj != None:
            self.kwlist = self.documentObj.keyword_list
        elif self.quoteObj != None:
            self.kwlist = self.quoteObj.keyword_list
        # Update the Tab Display
        self.UpdateKeywords()
    except TransanaExceptions.RecordNotFoundError:
        # A record was deleted out from under us: tell the user and bail out.
        msg = _(
            "The appropriate Keyword data could not be loaded from the database."
        )
        if not TransanaConstants.singleUserVersion:
            msg += '\n' + _(
                "This data may have been deleted by another user.")
        tmpDlg = Dialogs.ErrorDialog(self.parent, msg)
        tmpDlg.ShowModal()
        tmpDlg.Destroy()
        # Return to the database tab
        self.parent.parent.ControlObject.ShowDataTab(0)
def __init__(self):
    '''Build the text-filter interface on top of the base GUI frame.

    Constructs three LabelFrames inside the shared `root`: a file frame
    (checkable files), a filter frame (checkable text filters) and a state
    frame reporting whether filters have been applied. Also makes sure a
    Document object exists for every file name registered on the GUI class.
    '''
    super().__init__(root)
    self.forget()
    self.variables = []
    self.fileFrame = LabelFrame(root, text='Files')
    self.files()
    self.fileFrame.grid(row=2, column=0, columnspan=2,
                        sticky=W + E + N + S)
    self.filterframe = LabelFrame(root,
                                  text='Text Filters, Check to Apply')
    self.filters()
    self.filterframe.grid(row=2, column=2, columnspan=2, sticky=N)
    self.stateFrame = LabelFrame(root, text='State')
    self.state()
    self.stateFrame.grid(row=2, column=4, columnspan=1,
                         sticky=W + E + N + S)
    # No documents built yet: build one Document per registered file name.
    if GUI.fileObj == []:
        for thefile in GUI.fileName:
            GUI.fileObj.append(Document(thefile))
        for D in GUI.fileObj:
            D.generateWhole()
    # New files were added since the last build: only build the missing tail.
    elif len(GUI.fileObj) != len(GUI.fileName):
        lengthD = len(GUI.fileName) - len(GUI.fileObj)
        for i in range(lengthD)[::-1]:
            GUI.fileObj.append(Document(GUI.fileName[-(1 + i)]))
            GUI.fileObj[-1].generateWhole()
def calculate_MRR(document_directory_location, gold_label_directory_location,
                  stopwords, k, alpha, num_iterations, window):
    """Compute and print the Mean Reciprocal Rank over a document collection.

    For every file under document_directory_location that has a same-named
    gold-label file, builds a Document, runs preprocessing and word-graph
    construction (with the given window), and collects its reciprocal rank
    at cutoff k using the given alpha / iteration count.
    """
    Reciprocal_ranks = []
    for root, dirs, files in os.walk(document_directory_location):
        for file in files:
            # Only score documents that have a matching gold-label file.
            if p.exists(gold_label_directory_location + file):
                doc = Document(document_directory_location + file,
                               gold_label_directory_location + file,
                               stopwords)
                doc.Preprocess()
                doc.Create_word_graph(window)
                rec = doc.get_reciprocal_rank(k, alpha, num_iterations)
                #print(file,rec)
                Reciprocal_ranks.append(rec)
    # NOTE(review): raises ZeroDivisionError if no document/gold pair exists.
    mrr = sum(Reciprocal_ranks) / len(Reciprocal_ranks)
    print('Window =', window, "K =", k, 'MRR =', mrr)
def extractWithTemplate(self, template, xOffset=0, yOffset=0):
    """Recognize text in every region of `template` (shifted by the offsets).

    Returns a Document holding one DocumentBox per template box, or None
    when no template is given.
    """
    if not template:
        return None
    result = Document()
    for tbox in template.boxes:
        region = QRectF(tbox.rect)
        region.translate(xOffset, yOffset)
        raw = self.textInRegion(region, tbox.recognizer)
        cleaned = self.filter(raw, tbox.filter)
        dbox = DocumentBox()
        dbox.name = tbox.name
        dbox.text = cleaned
        dbox.templateBox = tbox
        result.addBox(dbox)
    return result
def normalize():
    """Load every *.persian_poem file in the working directory as a Document.

    Returns the list of Document objects, one per poem file, in glob order.
    """
    docs = []
    for name in glob.glob("*.persian_poem"):
        # BUG FIX: the original open(...).read() leaked the file handle.
        with open(name, "r") as handle:
            content = handle.read()
        docs.append(Document.Document(name, content))
    return docs
def fill_doc_list_train(file_path, amount=0):
    """Read tab-separated training lines from file_path into Document objects.

    Each line must hold at least three tab-separated fields, which become the
    Document's constructor arguments. `amount` caps the number of lines read;
    0 (the default) means read the whole file. Returns the Document list.
    """
    raw_lines = []
    with open(file_path, 'r', encoding='utf8') as infile:
        for line in infile:
            raw_lines.append(line)
            # BUG FIX: the limit was guarded by `if not amount:` so it could
            # never trigger; stop once `amount` lines have been collected.
            if amount and len(raw_lines) == amount:
                break
    documents_list = []
    for doc in raw_lines:
        fields = doc.split("\t")
        documents_list.append(Dc.Document(fields[0], fields[1], fields[2]))
    return documents_list  # list of Document class objects
def normalize(documents):
    """Load every *.persian_query file in the working directory as a query Document.

    Each query Document is built from (file name, file content, documents).
    Returns the list of query objects in glob order.
    """
    queries = []
    for name in glob.glob("*.persian_query"):
        # BUG FIX: the original open(...).read() leaked the file handle.
        with open(name, "r") as handle:
            content = handle.read()
        queries.append(Document.Document(name, content, documents))
    return queries
def import_docs(self, docs_file):
    """Parse 'label:sentence' lines into Documents, tallying pos/neg counts.

    Lines labelled 'neu' are skipped. Each parsed Document is appended to
    self.docs; the batch list is appended to self.docs_set. Word occurrence
    counters in self.values are bumped per label.
    """
    temp_docs = []
    for line in docs_file:
        parts = line.rstrip().split(":")
        if len(parts) > 1:
            label = parts[0]
            if label != 'neu':
                sentence = parts[1]
                doc = Document.Document(sentence, label)
                self.docs.append(doc)
                temp_docs.append(doc)
                # BUG FIX: label_num was unbound for labels other than
                # 'pos'/'neg' (UnboundLocalError on first such line);
                # default to 0, matching the sibling import_txt method.
                label_num = 0
                if doc.label == 'pos':
                    label_num = 0  # positive label
                elif doc.label == 'neg':
                    label_num = 1  # negative label
                self.label_counts[label_num] += 1
                for word in doc.words:
                    if word != '':
                        self.values[int(word)].appear[label_num] += 1
    self.docs_set.append(temp_docs)
def findMatchingTemplateByOffset(self, templates, offset=5):
    # Python 2 code (print statements).
    # Find the template whose 'matcher' boxes best match the scanned page,
    # trying every (x, y) shift in a fixed -5..+5 window, and return a dict
    # with the best template, its extracted document, and the winning offsets.
    # NOTE(review): the `offset` parameter is never used — the ranges below
    # hard-code 5; also `max` shadows the builtin. Confirm before changing.
    max = 0
    best = {
        'template': None,
        'document': Document(),
        'xOffset': 0,
        'yOffset': 0
    }
    for template in templates:
        if not template.boxes:
            continue
        # Consider up to 5 millimeter offset
        for xOffset in range(-5, 6):
            for yOffset in range(-5, 6):
                score = 0
                matcherBoxes = 0
                currentDocument = self.extractWithTemplate(
                    template, xOffset, yOffset)
                for documentBox in currentDocument.boxes:
                    templateBox = documentBox.templateBox
                    # Only boxes of type 'matcher' contribute to the score.
                    if documentBox.templateBox.type != 'matcher':
                        print "Jumping %s due to type %s" % (
                            templateBox.name, templateBox.type)
                        continue
                    matcherBoxes += 1
                    similarity = Trigram.trigram(documentBox.text,
                                                 templateBox.text)
                    score += similarity
                # Average the similarity over the matcher boxes.
                if matcherBoxes:
                    score = score / matcherBoxes
                if score > max or not matcherBoxes:
                    max = score
                    best = {
                        'template': template,
                        'document': currentDocument,
                        'xOffset': xOffset,
                        'yOffset': yOffset
                    }
                print "Template %s has score %s with offset (%s,%s)" % (
                    template.name, score, xOffset, yOffset)
    return best