예제 #1
0
 def applyfilters(self, variables, filterscs):
     '''
     Apply the checked filters to the checked files and update the state string.

     variables -- list of Tk checkbox variables, one per file; value 1 means
                  the file is selected.
     filterscs -- list of Tk checkbox variables, one per entry of
                  self.filters; value 1 means the filter is selected.
     '''
     if GUI.fileObj == []:
         # First call: build a Document for every known file name.
         for thefile in GUI.fileName:
             GUI.fileObj.append(Document(thefile))
         for D in GUI.fileObj:
             D.generateWhole()
     elif len(GUI.fileObj) != len(GUI.fileName):
         # Files were added since the last call; wrap only the trailing
         # entries of GUI.fileName that have no Document yet.
         lengthD = len(GUI.fileName) - len(GUI.fileObj)
         for i in reversed(range(lengthD)):
             GUI.fileObj.append(Document(GUI.fileName[-(1 + i)]))
             GUI.fileObj[-1].generateWhole()
     # Hoisted: the checked filters are the same for every checked file,
     # so collect them once instead of rebuilding the list per file.
     chosen = [
         self.filters[j]
         for j in range(len(filterscs)) if filterscs[j].get() == 1
     ]
     empty = True
     for i in range(len(variables)):
         if variables[i].get() == 1:
             dofil = TextFilter(GUI.fileObj[i])
             dofil.apply(chosen, GUI.fileObj[i])
             empty = False
     # Idiom fix: `if empty == True` replaced with a truthiness test.
     self.sti = 'Nothing applied' if empty else 'Successfully applied'
     self.sti_text.set(self.sti)
예제 #2
0
    def read_newdata(self, dfile, wordmapfile):
        """Read a held-out dataset, re-numbering word ids to a dense range.

        dfile       -- new-document file: first line is the document count,
                       then one whitespace-separated document per line.
        wordmapfile -- word -> id map produced at training time.

        For each line two parallel documents are built: ``doc`` keeps the
        training-time word ids, ``_doc`` uses compact ids 0..V-1 assigned in
        order of first appearance.  Returns 0 on success, 1 on error.
        """
        word2id = {}
        id2_id = {}  # training word id -> compact new id
        self.read_wordmap1(wordmapfile, word2id)  # load word2id from wordmapfile
        if len(word2id) <= 0:
            print("No word map available!\n")
            return 1
        try:
            fin = open(dfile)
        except OSError:
            # BUG FIX: open() never returns None; it raises on failure,
            # so the original `if fin == None` check was dead code.
            print("Cannot open file ", dfile, " to read!\n")
            return 1
        line = fin.readline()
        # get the number of new documents
        self.M = int(line)
        if self.M <= 0:
            print("No document available!\n")
            fin.close()  # BUG FIX: close the handle on this early return too
            return 1
        self.V = 0

        for i in range(self.M):
            line = fin.readline()
            strtok = Strtokenizer(line, ' \t\r\n')
            length = strtok.count_tokens()
            doc = []
            _doc = []
            for j in range(length):
                token = strtok.token(j)
                if token not in word2id:
                    # Unseen word: skipped (no handling was decided).
                    continue
                wid = word2id[token]
                # BUG FIX: the original scanned id2_id.values() for wid and,
                # when found, reused the *training* id as the compact id.
                # The compact mapping is keyed by training id, so membership
                # must be tested on the keys and the mapped value reused.
                if wid not in id2_id:
                    _id = len(id2_id)
                    id2_id[wid] = _id
                    self._id2id[_id] = wid
                else:
                    _id = id2_id[wid]
                doc.append(wid)
                _doc.append(_id)

            pdoc = Document(doc)
            _pdoc = Document(_doc)
            self.add_doc(pdoc, i)
            self._add_doc(_pdoc, i)

        fin.close()
        self.V = len(id2_id)
        return 0
예제 #3
0
    def read_file(self, file):
        """Parse one corpus file into a list of Document objects.

        '#begin' lines open a new document and become its header; other
        '#' lines close the current document; blank lines terminate the
        sentence accumulated so far; all remaining lines are word entries.
        """
        doc_list = []
        current = Document()
        pending_words = []
        sent_index = 0
        is_offset = 1  # kept for the disabled ISNotes offset mapping below
        for line in file.readlines():
            if line.startswith('#begin'):
                current = Document()
                current.header = line
            elif line.startswith('#'):
                # Document terminator: record it and start a fresh Document.
                current.end = line
                doc_list.append(current)
                sent_index = 0
                current = Document()
            elif line.isspace():
                # Blank line closes the current sentence.
                current.sentences.append(
                    Sentence(pending_words.copy(), sent_index))
                pending_words.clear()
                sent_index += 1
            else:
                entries = line.split()
                # First word entry of a document supplies the doc id.
                if not current.docID:
                    current.get_doc_id(entries[0])
                token = Word()
                token.set_entries(entries)
                pending_words.append(token)
                # Mapping for resolving difference in ISNotes offset and OntoNotes
                #current.coref_to_ISoffset[is_offset] = (current.header, sent_index, int(entries[2]))
                #is_offset += 1
        return doc_list
예제 #4
0
def load_dataset():
    """Load all cases, resolve references between them, and score them.

    Reads the case list and the reference map from the module-level paths,
    builds one Document per case, links outgoing references to the documents
    whose internal appnos contain them, then attaches articles/conclusions
    and PageRank scores.  Returns the list of Document objects.
    """
    documents = []

    with open(references_path) as reference_file:
        reference_dict = json.load(reference_file)

    with open(cases_path) as json_file:
        cases_json = json.load(json_file)

        for index, case in enumerate(cases_json):
            application_number = case['appno']

            # Every reference_dict key whose ';'-separated appno list
            # contains this case's application number.
            matching_key = [
                key for key in reference_dict.keys()
                if application_number in key.split(';')
            ]
            multi_refs = []            # all outgoing refs of the matching keys
            temp_internal_appnos = []  # all appnos named by the matching keys
            for k in matching_key:
                multi_refs.extend(reference_dict[k].keys())
                temp_internal_appnos.extend(k.split(';'))

            document = Document.Document(
                application_id=application_number,
                document_id=case['itemid'],
                title=case['docname'],
                references_appno=remove_dups(multi_refs),
                internal_appnos=remove_dups(temp_internal_appnos),
                multiple_appnos=False,  # NOTE(review): never set True here
            )  # related_appnos=remaining_appnos
            documents.append(document)
            print(f'Loaded document: {index}')

    # PERF FIX: index each appno to the first document (in load order) that
    # lists it, replacing the original O(docs * refs * docs) rescans with
    # O(1) lookups while producing the same first-match result.
    appno_to_doc_id = {}
    for doc in documents:
        for appno in doc.internal_appnos:
            appno_to_doc_id.setdefault(appno, doc.document_id)

    for count, doc in enumerate(documents):
        print(f'reffing doc {count}')
        for outgoing in doc.all_refs:
            target_id = appno_to_doc_id.get(outgoing)
            if target_id is not None:
                doc.outgoing_refs[target_id] = 1

    documents = add_articles_and_conclusion_to_documents(documents)
    adjacency_matrix = create_adjacency_matrix_from_references(documents)

    assign_pagerank_to_documents(documents, adjacency_matrix)

    return documents
예제 #5
0
    def finditer(self, doc):
        """Yield every match of this matcher sequence over the words of doc.

        doc can be a Document object or a plain text to convert into one.
        Each yielded value is the list of matched words.

        HINT: if you would rather have a (start,end) doc range, do
        (result[0].location[0], result[-1].location[-1])
        """
        # BUG FIX: `unicode` does not exist on Python 3 and would raise
        # NameError for any non-str argument; isinstance covers str (which
        # is unicode on Python 3).
        if isinstance(doc, str):
            doc = Document.Document(doc)
        words = doc.words()
        endi = len(words) - len(self.matchers) + 1
        for i in range(endi):
            matched = []
            # BUG FIX: the original inner loops applied *every* matcher to
            # the same word and appended that word once per matcher.
            # Matcher k must be tested against word i+k, one word each.
            for offset, matcher in enumerate(self.matchers):
                word = words[i + offset]
                if not matcher.match(word):
                    matched = None
                    break
                matched.append(word)
            if matched is not None:
                yield matched
예제 #6
0
    def add_dataset(self, folder_path):
        """Analyze every document of the dataset at folder_path and store it."""
        dataset = Dataset(folder_path)
        for content in dataset.get_contents():
            doc = Document(text=content, features=self.features)
            doc.analysis = self.analyze_source(source=doc)
            self.documents.append(doc)
예제 #7
0
    def buildDocumentCollectionRegex(self, chemin_doc):
        """Parse the file at chemin_doc into {doc_id: Document}.

        Documents are separated by ".I"; each document's text is the content
        of its ".W" section (empty string when the section is missing).
        """
        collection = {}
        # Context manager closes the file even if read() raises.
        with open(chemin_doc) as file_handle:
            contents = file_handle.read()

        docs = contents.split(".I")

        for d in range(1, len(docs)):
            doc = Document.Document()

            # Renamed from `id`, which shadowed the builtin.
            doc_id = re.search(r'(\d*|$)', docs[d][1:])
            value = re.search(r'\.W(.*?)\.', docs[d], re.DOTALL)

            doc.setId(doc_id.group(0))

            if value is not None:
                doc.setTexte(value.group(1).replace("\n", ' '))
            else:
                doc.setTexte("")

            collection[doc_id.group(0)] = doc

        return collection
예제 #8
0
    def buildDocCollectionSimple2(self, chemin_doc):
        """Parse the file at chemin_doc into {doc_id: Document}.

        Each document starts at a ".I" line carrying its id; its text is the
        block of lines after the next ".T" line, up to the next section
        marker (any line containing ".").
        """
        # Context manager replaces the manual open/close pair.
        with open(chemin_doc, "r") as file_handle:
            lignes = file_handle.readlines()
        length = len(lignes)
        collection = {}

        i = 0
        while i < length:
            if ".I" in lignes[i]:
                key = lignes[i][3:-1]
                doc = Document.Document()
                doc.setId(key)
                i += 1
                # BUG FIX: bound the scan so a truncated file with no ".T"
                # after the final ".I" no longer raises IndexError.
                while i < length and ".T" not in lignes[i]:
                    i += 1
                i += 1
                value = ""
                while i < length and "." not in lignes[i]:
                    value += lignes[i]
                    i += 1
                doc.setTexte(value[:-1])  # drop the trailing newline
                collection[key] = doc
            else:
                i += 1

        return collection
예제 #9
0
    def import_txt(self, txt_file):
        """Import labelled sentences from a tab-separated text file.

        Each line is "<label>\t<sentence>"; '__label__neu' lines are
        skipped.  Builds Document objects, updates per-label counts and
        per-word appearance counts.
        """
        temp_txt = []
        for line in txt_file:
            # Hoisted: the original re-split the same line three times.
            parts = line.rstrip().split("\t")
            if len(parts) > 1:
                label = parts[0]
                if label != '__label__neu':
                    sentence = parts[1]
                    doc = Document.Document(sentence, label)
                    self.docs.append(doc)
                    temp_txt.append(doc)
                    label_num = 0
                    if doc.label == '__label__pos':
                        label_num = 0  # positive label
                    elif doc.label == '__label__neg':
                        label_num = 1  # negative label
                    self.label_counts[label_num] += 1
                    for word in doc.words:
                        if word != '':
                            if word not in self.words:
                                value = WordValue()
                                self.words.append(word)
                                self.values.append(value)
                            # NOTE(review): indexing by int(word) assumes
                            # words are numeric ids aligned with
                            # self.values, yet new entries are appended at
                            # the end — verify against the callers.
                            self.values[int(word)].appear[label_num] += 1
예제 #10
0
 def createDocument(self, namespaceURI, qualifiedName, doctype):
     """Create a Document with the given doctype; when qualifiedName is
     supplied, also create a root element in namespaceURI and append it."""
     import Document
     document = Document.Document(doctype)
     if not qualifiedName:
         return document
     root = document.createElementNS(namespaceURI, qualifiedName)
     document.appendChild(root)
     return document
예제 #11
0
파일: Parser.py 프로젝트: NabilDam/RITAL
    def buildDocumentCollectionRegex(fichier):
        """Build the index from a document base contained in *fichier*.

        The whole file is read at once and regular expressions extract the
        tag contents.

        :type fichier: String
        :param fichier: the file holding the documents to index

        :return: a dict of Document objects keyed by document id:
                {"id1": Document1, "id2": Document2, ...}
        """
        resultat = dict()

        # Context manager guarantees the file is closed on any exit path
        # (the original leaked the handle if parsing raised).
        with open(fichier, 'r') as f:
            doc = f.read()
        docs = doc.split(".I")

        for di in range(1, len(docs)):
            d = Document.Document()
            id_doc = re.search(r'(\d*|$)', docs[di][1:]).group(0)
            d.setID(int(id_doc))
            m = re.search(r'\.T(.*?)\.', docs[di], re.DOTALL)
            if m is not None:
                d.setTexte(m.group(1).replace('\n', ' '))
            else:
                d.setTexte("")

            resultat[id_doc] = d
        return resultat
예제 #12
0
 def readDocuments(self):
     """Build a Document for every known file and cache its word list."""
     for idx in range(self.numFiles):
         doc = Document(self.fileDictionary[idx])
         doc.generateWhole()
         self.docDictionary[idx] = doc
         self.wordListDictionary[idx] = doc.getWordList()
     print("Read Documents.")
예제 #13
0
 def getDocument(self, text):
     """Extract the .I/.T/.W/.B fields of *text* and wrap them in a Document.

     Raises AttributeError when any of the four markers is missing.
     """
     fields = {}
     for name, marker in (('id', 'I'), ('title', 'T'),
                          ('text', 'W'), ('date', 'B')):
         fields[name] = re.search("\\." + marker + "\\s(.*)", text).group(1)
     body = fields['title'] + "\n" + fields['text'] + "\n" + fields['date']
     return Doc.Document(fields['id'], body, others=dict())
def find_coreferences(input_file, output_dir):
    """Resolve coreferences among the tagged mentions of a document.

    Loads input_file into a Document, links each tagged mention (anaphor)
    to an earlier mention (antecedent) — by head/overlap/acronym matching
    for non-pronouns and by exact or attribute matching for pronouns —
    then creates synthetic 'A<n>' tags for mentions still unmatched, and
    finally saves the result via doc.save().
    """
    doc = Document.Document(input_file, output_dir)
    tags = doc.tags

    for anaphor_idx in range(len(tags)):

        # Finding string matches
        if tags[anaphor_idx].content.lower() not in references.pronouns:
            # Non-pronoun: scan all earlier mentions, nearest first, and
            # stop at the first one that matches.
            for antecedent_idx in range(anaphor_idx - 1, -1, -1):
                # Link when the head words match, token overlap is >= 50%,
                # or either mention is an acronym of the other.
                if (head_word_match(tags[anaphor_idx].content,
                                    tags[antecedent_idx].content) or
                        overlap_similarity(tags[anaphor_idx].content,
                                           tags[antecedent_idx].content) >= .5
                        or tags[anaphor_idx].content.lower() in find_acronyms(
                            tags[antecedent_idx].content)
                        or tags[antecedent_idx].content.lower()
                        in find_acronyms(tags[anaphor_idx].content)):
                    tags[anaphor_idx].ref = tags[antecedent_idx].id

                    # Back-link the antecedent if it has no referent yet.
                    if not tags[antecedent_idx].ref:
                        tags[antecedent_idx].ref = tags[anaphor_idx].id

                    break

        # Look for pronouns
        else:
            # Pronoun: only look back at up to the 9 preceding mentions.
            # Note: no break — a later (farther) match overwrites ref.
            for antecedent_idx in range(anaphor_idx - 1,
                                        max(-1, anaphor_idx - 10), -1):
                if tags[anaphor_idx].content.lower(
                ) == tags[antecedent_idx].content.lower():
                    tags[anaphor_idx].ref = tags[antecedent_idx].id
                    if not tags[antecedent_idx].ref:
                        tags[antecedent_idx].ref = tags[anaphor_idx].id
                elif pronoun_attribute_match(
                        tags[anaphor_idx].content.lower(),
                        tags[antecedent_idx].content.lower()):
                    tags[anaphor_idx].ref = tags[antecedent_idx].id
                    if not tags[antecedent_idx].ref:
                        tags[antecedent_idx].ref = tags[anaphor_idx].id

    # Mentions still without a referent are matched against the raw text on
    # their final word; each hit becomes a new synthetic antecedent tag.
    unmatched_tags = [t for t in doc.tags if not t.ref]
    counter = 0
    for tag in unmatched_tags:
        # Final word of the mention, bounded by start-of-text/whitespace on
        # the left and end/punctuation/whitespace on the right.
        TAG_RE = re.compile('(?:^|[\s])(' +
                            re.escape(tag.content.lower().split()[-1]) +
                            ')(?:$|[\.\s\,\!\?])')

        matches = re.findall(TAG_RE, doc.content)

        if matches:
            for match in matches:
                new_id = 'A' + str(counter)
                doc.tags.append(Document.Tag(new_id, tag.id, match))
                tag.ref = new_id
                counter += 1

    doc.save()
예제 #15
0
def getAllDocuments():
    """Return a sorted list of Document objects, one per file in ./Docs."""
    paths = ["./Docs/" + name for name in os.listdir("./Docs")]
    return sorted(Document(path) for path in paths)
예제 #16
0
def getTechDocuments():
    """Return a sorted list of Document objects for ./Docs3/Technology."""
    folder = "./Docs3/Technology/"
    docs: List[Document.Document] = [
        Document.Document(folder + name, 0)
        for name in os.listdir("./Docs3/Technology")
    ]
    return sorted(docs)
예제 #17
0
 def run(self):
     """Summarize every document in self.documents_list and write each
     5-sentence summary to "<basename>.txt" (UTF-8 with BOM)."""
     for text in self.documents_list:
         summary = Document(text).get_summary(5)
         filename = text.split('\\')[-1]
         filename = filename[:filename.index(".")]  # strip from first dot
         with codecs.open(filename + ".txt", "w",
                          "utf-8-sig") as summary_file:
             summary_file.write(summary)
             # FIX: removed the explicit close() — the with-block already
             # closes the file; closing it twice was redundant.
    def docs(self, fileids=None):
        """Return Document objects for every <Document> element found in
        the files named by the fileids parameter."""
        documents = []
        for path in self.abspaths(fileids):
            tree = parse(path)
            documents.extend(
                Document(node)
                for node in tree.getElementsByTagName('Document'))
        return documents
예제 #19
0
    def add_source(self, data, kind):
        """Analyze *data* and file it as a query or a document, per *kind*."""
        if kind == SourceType.query:
            source = Query(text=data, features=self.features)
            source.analysis = self.analyze_source(source=source)
            self.queries.append(source)
        elif kind == SourceType.document:
            source = Document(text=data, features=self.features)
            source.analysis = self.analyze_source(source=source)
            source.document_id = len(self.documents)
            self.documents.append(source)
예제 #20
0
    def rankFile(self, filename, measure='DRDC', overwrite=False):
        """Score a file rather than an entire RDG. NOT SUPPORTED!

        BUG FIX: `overwrite` was referenced but never defined, so every
        call raised NameError; it is now a keyword parameter defaulting to
        False (backward-compatible for existing callers).
        Returns (term, score) pairs sorted by score, highest first.
        """
        ranking = []
        d = Document(filename=filename, overwrite=overwrite)
        for w in d.counts:
            temp = self.metrics[measure](w)
            # Placeholder for unstemming: each w currently maps to itself.
            for s in [w]:
                ## Filter.unstem(w):
                ranking.append((s, temp))
        ranking.sort(key=lambda x: x[1], reverse=True)
        return ranking
예제 #21
0
 def getDocs(
     self
 ):  #reads files in the library file directory and converts to Document.
     """Load every .txt file in 'docfiles', parse its Title / Doctags /
     Pages / Pagetags sections, and append the resulting Document to
     self.documents.  Files with an unexpected layout are skipped."""
     if not os.path.exists('docfiles'):
         os.makedirs('docfiles')
     for filename in os.listdir('docfiles'):
         if not filename.endswith('.txt'):
             continue
         filepath = 'docfiles' + os.sep + filename
         with open(filepath, "rt") as f:
             contents = f.read()
         doctags = []
         docContents = []
         pagetags = []
         convertedTagList = []
         pages = []
         # Sections are delimited by "(/)"; text before the first
         # delimiter is discarded.
         section = contents.split("(/)")[1:]
         try:
             assert (section[0].startswith("Title:"))
             assert (section[1].startswith("Doctags:"))
             assert (section[2].startswith("Pages:"))
             assert (section[3].startswith("Pagetags:"))
         except (AssertionError, IndexError):
             # IndexError added: a file with fewer than four sections is
             # just as invalid and should not crash the loader.
             print("Attempted to read invalid document.")
             continue
         title = section[0][len("Title:"):].strip()
         if not section[1][len("Doctags:"):].isspace():
             doctagstr = section[1][len("Doctags:"):].strip()
             for tag in doctagstr.split(","):
                 doctags.append(Tag(tag.strip()))
         if not section[2][len("Pages:"):].isspace():
             pagestr = section[2][len("Pages:"):].strip()
             for elem in pagestr.split("<pwords>"):
                 docContents.append(str(elem.strip()))
         pagetagstr = section[3][len("Pagetags:"):].strip()
         for elem in pagetagstr.split("<tname>"):
             if not elem.isspace():
                 # pagetags holds one comma-joined tag string per page.
                 pagetags.append(elem.strip())
         for taglist in pagetags:
             convertedPageTags = []
             for tag in taglist.split(","):
                 if not tag.isspace() and not tag == "":
                     convertedPageTags.append(Tag(tag.strip()))
             convertedTagList.append(convertedPageTags)
         # BUG FIX: the original assigned the functions os.path.getctime /
         # os.path.getmtime themselves and then formatted them with
         # f"{...}", storing the function repr instead of a timestamp.
         # Call them on the document's file to get real times.
         makeTime = os.path.getctime(filepath)
         editTime = os.path.getmtime(filepath)
         for i in range(len(docContents)):
             pages.append(
                 Page(self, docContents[i], convertedTagList[i]))
         self.documents.append(
             Document(self, filename, title, f"{makeTime}",
                      f"{editTime}", doctags, pages))
예제 #22
0
    def Refresh(self):
        """Reload all data when this tab is displayed.

        The underlying objects may have been changed since they were
        originally loaded, so each defined object is re-fetched and the
        local keyword list pointer is re-aimed at the appropriate source
        object before the tab display is updated.
        """
        try:
            # If a Library Object is defined, reload it.
            # Idiom fix throughout: `is not None` instead of `!= None`.
            if self.seriesObj is not None:
                self.seriesObj = Library.Library(self.seriesObj.number)
            # If an Episode Object is defined, reload it
            if self.episodeObj is not None:
                self.episodeObj = Episode.Episode(self.episodeObj.number)
            # If a Document Object is defined, reload it
            if self.documentObj is not None:
                self.documentObj = Document.Document(self.documentObj.number)
            # If a Collection Object is defined, reload it
            if self.collectionObj is not None:
                self.collectionObj = Collection.Collection(
                    self.collectionObj.number)
            # If a Clip Object is defined, reload it.
            if self.clipObj is not None:
                self.clipObj = Clip.Clip(self.clipObj.number)
            # If a Quote Object is defined, reload it.
            if self.quoteObj is not None:
                self.quoteObj = Quote.Quote(self.quoteObj.number)
            # Get the local keyword list pointer aimed at the appropriate source object.
            # NOTE:  If a Clip is defined use it (whether an episode is defined or not.)  If
            #        no clip is defined but an episode is defined, use that.
            if self.clipObj is not None:
                self.kwlist = self.clipObj.keyword_list
            elif self.episodeObj is not None:
                self.kwlist = self.episodeObj.keyword_list
            elif self.documentObj is not None:
                self.kwlist = self.documentObj.keyword_list
            elif self.quoteObj is not None:
                self.kwlist = self.quoteObj.keyword_list

            # Update the Tab Display
            self.UpdateKeywords()
        except TransanaExceptions.RecordNotFoundError:
            msg = _(
                "The appropriate Keyword data could not be loaded from the database."
            )
            if not TransanaConstants.singleUserVersion:
                msg += '\n' + _(
                    "This data may have been deleted by another user.")
            tmpDlg = Dialogs.ErrorDialog(self.parent, msg)
            tmpDlg.ShowModal()
            tmpDlg.Destroy()
            # Return to the database tab
            self.parent.parent.ControlObject.ShowDataTab(0)
예제 #23
0
 def __init__(self):
     '''
     Construct the text-filter interface on top of the GUI base class.

     Three LabelFrames are laid out: one listing the files, one listing
     the available text filters, and one showing whether the filters have
     been applied.  Document objects are created lazily so GUI.fileObj
     always mirrors GUI.fileName.
     '''
     super().__init__(root)
     self.forget()
     self.variables = []
     self.fileFrame = LabelFrame(root, text='Files')
     self.files()
     self.fileFrame.grid(row=2,
                         column=0,
                         columnspan=2,
                         sticky=W + E + N + S)
     self.filterframe = LabelFrame(root,
                                   text='Text Filters, Check to Apply')
     self.filters()
     self.filterframe.grid(row=2, column=2, columnspan=2, sticky=N)
     self.stateFrame = LabelFrame(root, text='State')
     self.state()
     self.stateFrame.grid(row=2,
                          column=4,
                          columnspan=1,
                          sticky=W + E + N + S)
     if not GUI.fileObj:
         # First construction: wrap every file name in a Document.
         for name in GUI.fileName:
             GUI.fileObj.append(Document(name))
         for document in GUI.fileObj:
             document.generateWhole()
     elif len(GUI.fileObj) != len(GUI.fileName):
         # Files were added since last time: wrap only the new tail of
         # GUI.fileName, in order.
         missing = len(GUI.fileName) - len(GUI.fileObj)
         for name in GUI.fileName[-missing:]:
             GUI.fileObj.append(Document(name))
             GUI.fileObj[-1].generateWhole()
예제 #24
0
파일: Main.py 프로젝트: msadat3/Text_Rank
def calculate_MRR(document_directory_location, gold_label_directory_location,
                  stopwords, k, alpha, num_iterations, window):
    """Compute and print the Mean Reciprocal Rank over every document that
    has a matching gold-label file.

    Documents are walked recursively under document_directory_location;
    each is preprocessed, turned into a word graph with the given window,
    and scored with the (k, alpha, num_iterations) TextRank parameters.
    """
    Reciprocal_ranks = []
    for root, dirs, files in os.walk(document_directory_location):
        for file in files:
            if p.exists(gold_label_directory_location + file):
                doc = Document(document_directory_location + file,
                               gold_label_directory_location + file, stopwords)
                doc.Preprocess()
                doc.Create_word_graph(window)
                rec = doc.get_reciprocal_rank(k, alpha, num_iterations)
                #print(file,rec)
                Reciprocal_ranks.append(rec)
    # BUG FIX: guard against ZeroDivisionError when no document has a
    # matching gold-label file.
    if not Reciprocal_ranks:
        print('Window =', window, "K =", k, 'MRR undefined (no documents)')
        return
    mrr = sum(Reciprocal_ranks) / len(Reciprocal_ranks)
    print('Window =', window, "K =", k, 'MRR =', mrr)
예제 #25
0
    def extractWithTemplate(self, template, xOffset=0, yOffset=0):
        """Recognize the text of every box of *template*, shifted by the
        given offsets, and return the filled Document (None when template
        is missing)."""
        if not template:
            return None
        result = Document()
        for box in template.boxes:
            region = QRectF(box.rect)
            region.translate(xOffset, yOffset)

            raw = self.textInRegion(region, box.recognizer)
            extracted = DocumentBox()
            extracted.name = box.name
            extracted.text = self.filter(raw, box.filter)
            extracted.templateBox = box
            result.addBox(extracted)
        return result
예제 #26
0
def normalize():
    """Load every *.persian_poem file in the CWD into Document objects."""
    docs = []
    for name in glob.glob("*.persian_poem"):
        # BUG FIX: the original opened each file with open(...).read() and
        # never closed it; the context manager releases the handle.
        with open(name, "r") as handle:
            content = handle.read()
        docs.append(Document.Document(name, content))

    return docs
def fill_doc_list_train(file_path, amount=0):
    """Read up to *amount* tab-separated lines from file_path and wrap each
    in a Document; amount=0 (the default) means "read every line".
    """
    documents_list = []
    docs_splitted = []

    with open(file_path, 'r', encoding='utf8') as infile:
        for line in infile:
            docs_splitted.append(line)
            # BUG FIX: the original tested `if not amount`, so a non-zero
            # amount never stopped the loop and the whole file was always
            # read; the limit now actually applies.
            if amount and len(docs_splitted) == amount:
                break

    for doc in docs_splitted:
        single_doc = doc.split("\t")
        new_doc = Dc.Document(single_doc[0], single_doc[1], single_doc[2])
        documents_list.append(new_doc)
    # list of Document class objects
    return documents_list
예제 #28
0
def normalize(documents):
    """Load every *.persian_query file in the CWD into Document objects,
    linking each query to *documents*."""
    queries = []
    for name in glob.glob("*.persian_query"):
        # BUG FIX: close each query file instead of leaking the handle.
        with open(name, "r") as handle:
            content = handle.read()
        queries.append(Document.Document(name, content, documents))

    return queries
예제 #29
0
 def import_docs(self, docs_file):
     """Import labelled sentences from a colon-separated file.

     Each line is "<label>:<sentence>"; 'neu' lines are skipped.  Updates
     self.docs, per-label counts and per-word appearance counts, and
     appends the batch to self.docs_set.
     """
     temp_docs = []
     for line in docs_file:
         # Hoisted: the original re-split the same line twice.
         parts = line.rstrip().split(":")
         if len(parts) > 1:
             label = parts[0]
             if label != 'neu':
                 sentence = parts[1]
                 doc = Document.Document(sentence, label)
                 self.docs.append(doc)
                 temp_docs.append(doc)
                 # BUG FIX: label_num was unbound (NameError) for labels
                 # other than pos/neg; default to 0 as the sibling
                 # import_txt method does.
                 label_num = 0
                 if doc.label == 'pos':
                     label_num = 0  # positive label
                 elif doc.label == 'neg':
                     label_num = 1  # negative label
                 self.label_counts[label_num] += 1
                 for word in doc.words:
                     if word != '':
                         self.values[int(word)].appear[label_num] += 1
     self.docs_set.append(temp_docs)
예제 #30
0
    def findMatchingTemplateByOffset(self, templates, offset=5):
        """Try every template at every (x, y) offset in [-5, 5] and return
        the best scoring combination as a dict with keys 'template',
        'document', 'xOffset', 'yOffset'.

        A template's score at an offset is the mean trigram similarity
        between each of its 'matcher' boxes and the text extracted there.
        FIX: Python-2-only `print` statements replaced with print() calls
        (identical output on Python 2 and 3); `max` renamed, as it
        shadowed the builtin.
        """
        best_score = 0
        best = {
            'template': None,
            'document': Document(),
            'xOffset': 0,
            'yOffset': 0
        }
        for template in templates:
            if not template.boxes:
                continue
            # Consider up to 5 millimeter offset
            for xOffset in range(-5, 6):
                for yOffset in range(-5, 6):
                    score = 0
                    matcherBoxes = 0
                    currentDocument = self.extractWithTemplate(
                        template, xOffset, yOffset)
                    for documentBox in currentDocument.boxes:
                        templateBox = documentBox.templateBox
                        if documentBox.templateBox.type != 'matcher':
                            print("Jumping %s due to type %s" % (
                                templateBox.name, templateBox.type))
                            continue
                        matcherBoxes += 1
                        similarity = Trigram.trigram(documentBox.text,
                                                     templateBox.text)
                        score += similarity

                    if matcherBoxes:
                        score = score / matcherBoxes

                    if score > best_score or not matcherBoxes:
                        best_score = score
                        best = {
                            'template': template,
                            'document': currentDocument,
                            'xOffset': xOffset,
                            'yOffset': yOffset
                        }
                    print("Template %s has score %s with offset (%s,%s)" % (
                        template.name, score, xOffset, yOffset))
        return best