Ejemplo n.º 1
0
    def __init__(self):
        """Initialise the Persian alphabet, affix sets and the
        multi-word-expression correction patterns used for tokenizing."""
        # Characters treated as letters of a word (incl. ZWNJ and diacritics).
        self.alphabet = [
            'پ', 'چ', 'ج', 'ح', 'خ', 'ه', 'ع', 'غ', 'ف', 'ق', 'ث', 'ص', 'ض',
            'گ', 'ک', 'م', 'ن', 'ت', 'ا', 'ل', 'ب', 'آ', 'ی', 'س', 'ش', 'و',
            'ئ', 'د', 'ذ', 'ر', 'ز', 'ط', 'ظ', 'أ', 'ژ', '\u200c', 'ُ', 'ّ',
            'ة', 'ۀ', 'ؤ', 'ء', 'إ'
        ]
        self.constants = ConstantVars()
        # Verb clitics that attach after a stem.
        self.after_verbs = {'ام', 'ای', 'ایم', 'اید', 'اند'}
        # Verb prefixes that attach before a stem.
        self.before_verbs = {'می', 'نمی'}
        # Nominal suffixes that get re-joined to the preceding token.
        self.suffixes = {
            'ی', 'ای', 'ها', 'های', 'تر', 'تری', 'ترین', 'گر', 'گری', 'ام',
            'ات', 'اش'
        }
        # (badly spaced form, corrected form) pairs for fixed expressions.
        raw_expressions = [
            ('علی ای حال', 'علی\u200cای\u200cحال'),
            ('بنا بر این', 'بنابراین'),
            ('بنابر این', 'بنابراین'),
            ('مع ذالک', 'مع\u200cذالک'),
            ('فی ما بین', 'فی\u200cمابین'),
            ('فی مابین', 'فی\u200cمابین'),
            ('چنان چه', 'چنان\u200cچه'),
            ('در واقع', 'درواقع'),
            ('فی کل حال', 'فی\u200cکل\u200cحال'),
        ]
        self.expression = compile_patterns(raw_expressions)
Ejemplo n.º 2
0
 def __init__(self, i):
     """Keep the index-task handle and pre-allocate posting buffers.

     i: the index object exposing getRelatedDocs(token).
     """
     self.wordFormer = FormWords()
     self.indexTasks = i
     self.constants = ConstantVars()
     # 100 pre-allocated slots each for included / excluded term postings.
     self.relatedDocs = np.array([dict() for _ in range(100)])
     self.notRelatedDocs = np.array([dict() for _ in range(100)])
     self.notRelatedCounts = 0
Ejemplo n.º 3
0
 def __init__(self):
     """Create I/O helpers and an empty dictionary / positional posting list."""
     self.input = FileInOut()
     self.wordFormer = FormWords()
     self.constants = ConstantVars()
     self.dictionary = {}
     # One postings dict per dictionary term, pre-allocated for 150k terms.
     self.posting_list = np.array([dict() for _ in range(150000)])
     self.dicIndex = 0   # next free posting-list row
     self.docIndex = 0   # id of the document currently being indexed
     self.c = 0          # running token count
Ejemplo n.º 4
0
 def __init__(self):
     """Load the persisted dictionary, docID and posting files for querying."""
     self.input = FileInOut()
     self.Dic = self.input.readDic()
     self.DocID_file = self.input.readDocID()
     self.posting_file = self.input.readPostingList()
     self.wordFormer = FormWords()
     self.constants = ConstantVars()
     # Accumulators for included / excluded query terms and their positions.
     self.relatedDocs = []
     self.notRelatedDocs = []
     self.relatedDocsPos = []
     self.notRelatedDocsPos = []
     self.notRelatedCounts = 0
Ejemplo n.º 5
0
    def get_query_termList(query):
        """Reduce a raw query string to a list of stemmed + lemmatized terms.

        Normalizes and tokenizes *query*, drops punctuation and stop-words,
        then applies uniforming, stemming and lemmatization. Stray double
        quotes left over from phrase syntax are removed at the end.

        Note: defined without ``self``; callers invoke it through the class.
        """
        wordFormer = FormWords()
        constants = ConstantVars()
        query = wordFormer.normalize(query)
        query_tokens = wordFormer.tokenize(query)
        # Fix: the original called list.remove() while iterating the same
        # list, which skips the element following each removed token.
        dropped = set(constants.punctuations()) | set(constants.StopWords())
        query_tokens = [t for t in query_tokens if t not in dropped]
        query_tokens = wordFormer.uniform(query_tokens)
        # postaged_tokens = wordFormer.posTagging(query_tokens)
        stemmed_tokens = wordFormer.stemmWords(query_tokens)
        lemmatized_tokens = wordFormer.lemmatizeWords(stemmed_tokens)
        # Phrase-delimiter quotes are not index terms.
        return [t for t in lemmatized_tokens if t != '"']
Ejemplo n.º 6
0
#
#     def lemmatize(self, word, pos=''):
#         if not pos and word in self.words:
#             return word
#
#         if (not pos or pos == 'V') and word in self.verbs:
#             return self.verbs[word]
#
#         if pos.startswith('AJ') and word[-1] == 'ی':
#            + with_nots(present_simples) + with_nots(
#                 present_imperfects) + present_subjunctives + present_not_subjunctives + imperatives)
from DataLayer.constants import ConstantVars
from BusinessLayer.textOperations import FormWords

# Ad-hoc driver: tokenize a sample query, POS-tag / stem / lemmatize it, then
# strip punctuation and stop-words from the result.
wordFormer = FormWords()
constants = ConstantVars()
query_tokens = wordFormer.tokenize("شفاف سازی")
print('query tokens')
print(query_tokens)
postaged_tokens = wordFormer.posTagging(query_tokens)
print(postaged_tokens)
stemmed_tokens = wordFormer.stemmWords(query_tokens, len(query_tokens))
print(stemmed_tokens)
lemmatized_tokens = wordFormer.lemmatizeWords(stemmed_tokens, postaged_tokens,
                                              len(query_tokens))
print(lemmatized_tokens)

# Fix: build a new list instead of calling list.remove() inside the loop over
# the same list, which skipped the element following every removed token.
_dropped = set(constants.punctuations()) | set(constants.StopWords())
lemmatized_tokens = [t for t in lemmatized_tokens if t not in _dropped]
print(lemmatized_tokens)
Ejemplo n.º 7
0
class Query:
    """Boolean/phrase query processing over an in-memory index.

    processQuery understands '"..."' phrase groups and '!' term negation;
    documents are combined by intersecting sorted posting lists and then
    subtracting the negated-term postings.
    """

    def __init__(self, i):
        # i: index-task object exposing getRelatedDocs(token).
        self.wordFormer = FormWords()
        self.indexTasks = i
        self.constants = ConstantVars()
        # Pre-allocated slots for included / excluded term postings.
        self.relatedDocs = np.array([dict() for i in range(100)])
        self.notRelatedDocs = np.array([dict() for i in range(100)])
        self.notRelatedCounts = 0

    def processQuery(self, query):
        """Parse *query* into plain / phrase / negated terms, fetch postings
        from the index, and return (doc_ids, positions) of the matches."""
        query_tokens = self.wordFormer.tokenize(query)
        for token in query_tokens:
            # NOTE(review): list.remove() inside iteration skips the element
            # that follows each removed token — verify intended.
            if token in self.constants.punctuations(
            ) or token in self.constants.StopWords():
                query_tokens.remove(token)
        print('query tokens')
        print(query_tokens)
        postaged_tokens = self.wordFormer.posTagging(query_tokens)
        stemmed_tokens = self.wordFormer.stemmWords(query_tokens,
                                                    len(query_tokens))
        lemmatized_tokens = self.wordFormer.lemmatizeWords(
            stemmed_tokens, postaged_tokens, len(query_tokens))
        i = j = 0
        k = -1              # index of the current '"'-delimited phrase group
        not_include = False  # True when the next token is negated by '!'
        order = False        # True while inside a quoted phrase
        orderTokens = [[] for i in range(5)]
        for token in lemmatized_tokens:
            if token == "\"" and order == False:
                k += 1
                order = True
                continue
            if token == "\"" and order == True:
                order = False
                continue
            if order:
                orderTokens[k].append(token)
                continue
            if token == "!":
                not_include = True
                self.notRelatedCounts += 1
                continue
            if not_include:
                # NOTE(review): j is never incremented, so successive negated
                # terms overwrite slot 0 — confirm against notMerge usage.
                self.notRelatedDocs[j] = self.indexTasks.getRelatedDocs(token)
                not_include = False
            else:
                self.relatedDocs[i] = self.indexTasks.getRelatedDocs(token)
                i += 1
        print('related docs')
        print(self.relatedDocs)
        related_result, relatedPos = self.merge(self.relatedDocs, i)
        docs = np.array([dict() for i in range(10)])
        doc_pos = np.array([dict() for i in range(10)])
        j = 0
        if related_result != []:
            docs[j] = related_result
            doc_pos[j] = relatedPos
            j += 1
        # NOTE(review): k+1 phrase groups were collected but this merges only
        # range(0, k - 1) of them — possible off-by-one (the QueryProc class
        # later in this file uses range(0, k) with k starting at 0).
        for i in range(0, k - 1):
            phrase_container, phrase_pos = self.phraseContainerDocs(
                orderTokens[i])
            docs[j] = phrase_container
            doc_pos[j] = phrase_pos
            print('phrase')
            print(phrase_container)
            print(phrase_pos)
            j += 1
        final_result, final_pos = self.finalMerge(docs, doc_pos, j)
        relateds_and_not_unrelateds, related_pos = self.notMerge(
            final_result, final_pos)
        print(relateds_and_not_unrelateds)
        print(related_pos)
        return relateds_and_not_unrelateds, related_pos

    def merge(self, docs, len):
        """Intersect *len* postings dicts (keys assumed sorted doc ids),
        concatenating the position lists of the common documents.

        NOTE(review): the second parameter shadows builtin len().
        """
        answer = []
        postingAns = []
        if len == 0:
            return [], []
        elif len == 1:
            return list(docs[0].keys()), list(docs[0].values())
        else:
            p2 = list(docs[0].keys())
            postings2 = []
            for docID in p2:
                postings2.append(docs[0][docID])
            i = 1
            while i < len:
                p1 = list(docs[i].keys())
                postings1 = []
                for docID in p1:
                    postings1.append(docs[i][docID])
                i += 1
                # Two-pointer sorted-list intersection via pop-front.
                while p1 != [] and p2 != []:
                    if p1[0] == p2[0]:
                        answer.append(p1[0])
                        postingAns.append(postings1[0] + postings2[0])
                        p1.remove(p1[0])
                        p2.remove(p2[0])
                        postings1.remove(postings1[0])
                        postings2.remove(postings2[0])
                    elif p1[0] < p2[0]:
                        p1.remove(p1[0])
                        postings1.remove(postings1[0])
                    else:
                        p2.remove(p2[0])
                        postings2.remove(postings2[0])
                p2 = answer
                postings2 = postingAns
        print('docc')
        print(answer)
        print(postingAns)
        return answer, postingAns

    def finalMerge(self, docs, docPos, length):
        """Intersect *length* (doc-id list, positions list) pairs, combining
        positions of the common documents (same scheme as merge())."""
        answer = []
        docPosAns = []
        if length == 0:
            return [], []
        elif length == 1:
            return list(docs[0]), list(docPos[0])
        else:
            p2 = list(docs[0])
            docPos2 = list(docPos[0])
            i = 1
            while i < length:
                p1 = list(docs[i])
                docPos1 = list(docPos[i])
                i += 1
                while p1 != [] and p2 != []:
                    if p1[0] == p2[0]:
                        answer.append(p1[0])
                        docPosAns.append(docPos1[0] + docPos2[0])
                        p1.remove(p1[0])
                        p2.remove(p2[0])
                        docPos1.remove(docPos1[0])
                        docPos2.remove(docPos2[0])
                    elif p1[0] < p2[0]:
                        p1.remove(p1[0])
                        docPos1.remove(docPos1[0])
                    else:
                        p2.remove(p2[0])
                        docPos2.remove(docPos2[0])
                p2 = answer
                docPos2 = docPosAns
        print('docc and double quote')
        print(answer)
        print(docPosAns)
        return answer, docPosAns

    def notMerge(self, relatedDocs, relatedPos):
        """Remove from *relatedDocs* every document that appears in any of
        the negated-term posting dicts collected in self.notRelatedDocs."""
        print('no relate')
        print(self.notRelatedDocs)
        answer = []
        postingAns = []
        if self.notRelatedCounts == 0:
            if len(relatedDocs) != 0:
                return relatedDocs, list(relatedPos)
            else:
                return [], []
        else:
            p1 = relatedDocs
            posting1 = relatedPos
            i = 0
            while i < self.notRelatedCounts:
                p2 = list(self.notRelatedDocs[i].keys())
                i += 1
                # Keep p1 entries that are smaller than the current excluded
                # id; drop matches from both lists.
                while p1 != [] and p2 != []:
                    if p1[0] == p2[0]:
                        p1.remove(p1[0])
                        posting1.remove(posting1[0])
                        p2.remove(p2[0])
                    elif p1[0] < p2[0]:
                        answer.append(p1[0])
                        postingAns.append(posting1[0])
                        posting1.remove(posting1[0])
                        p1.remove(p1[0])
                    else:
                        p2.remove(p2[0])
        # Whatever survived all exclusion lists is part of the answer.
        for p in p1:
            answer.append(p)
        for posting in posting1:
            postingAns.append(posting)
        print('finall docc')
        return answer, postingAns

    def phraseContainerDocs(self, pharase):
        """Return (doc_ids, positions) for documents containing the tokens of
        *pharase* at consecutive positions.

        NOTE(review): docs is pre-allocated to 10 dicts so length is always
        10 regardless of the phrase length — confirm intended.
        """
        # to numbers of pharase length
        docs = np.array([dict() for i in range(10)])
        i = 0
        for p in pharase:
            docs[i] = self.indexTasks.getRelatedDocs(p)
            i += 1
        answer = []
        answer_posting = [[] for i in range(50)]
        length = len(docs)
        if length == 0:
            return [], []
        elif length == 1:
            return list(docs[0].keys()), list(docs[0].values())
        else:
            p2 = list(docs[0].keys())
            posting2 = list(docs[0].values())
            i = 1
            index = -1
            while i < length:
                p1 = list(docs[i].keys())
                posting1 = list(docs[i].values())
                i += 1
                while p1 != [] and p2 != []:
                    if p1[0] == p2[0]:
                        # A document matches when some position of the prefix
                        # is immediately followed by the next token.
                        for posting in posting2[0]:
                            if (posting + 1) in posting1[0]:
                                if p1[0] not in answer:
                                    answer.append(p1[0])
                                    index += 1
                                answer_posting[index].append(posting + 1)
                        # print({p1[0] : docs[i - 1][p1[0]]})
                        p1.remove(p1[0])
                        p2.remove(p2[0])
                        posting1.remove(posting1[0])
                        posting2.remove(posting2[0])
                    elif p1[0] < p2[0]:
                        p1.remove(p1[0])
                        posting1.remove(posting1[0])
                    else:
                        p2.remove(p2[0])
                        posting2.remove(posting2[0])
                p2 = answer
                posting2 = answer_posting
        print('double qoute')
        print(answer)
        print(answer_posting)
        return answer, answer_posting
Ejemplo n.º 8
0
class Index:
    """Builds a positional inverted index over the news corpus and persists
    it through FileInOut.

    The dictionary's insertion order defines each term's row in
    posting_list; posting_list[row] maps doc id -> list of token positions.
    """

    def __init__(self):
        self.input = FileInOut()
        self.wordFormer = FormWords()
        self.constants = ConstantVars()
        self.dictionary = dict()
        # One postings dict per dictionary term, pre-allocated (150k max).
        self.posting_list = np.array([dict() for j in range(150000)])
        self.dicIndex = 0  # next free posting-list row
        self.docIndex = 0  # id of the document currently being indexed
        self.c = 0         # total token count (statistics only)

    def Filter(self, string, substr):
        """Return *string* (a token list) with every token found in *substr*
        replaced by '**' — placeholders keep token positions aligned."""
        # Equivalent to the original any(sub == tok ...) membership test,
        # without shadowing the builtin str.
        return [tok if tok not in substr else '**' for tok in string]

    def makeDic(self, value, j):
        """Record token *value* occurring at position *j* of the current doc.

        '**' placeholders and tokens containing newlines are ignored.
        """
        if value != '**':
            if value not in self.dictionary:
                if '\n' not in value:
                    self.dictionary[value] = 1
                    self.input.writeDic([value])
                    self.posting_list[self.dicIndex][self.docIndex] = [j]
                    self.dicIndex += 1
            else:
                # Existing term: append the position to this doc's list.
                row = list(self.dictionary.keys()).index(value)
                if self.docIndex in self.posting_list[row]:
                    self.posting_list[row][self.docIndex].append(j)
                else:
                    self.posting_list[row][self.docIndex] = [j]

    def indexData(self):
        """Index the 15 'ir-news-N.csv' files and persist docIDs and
        posting lists through FileInOut."""
        for n in range(15):
            data = self.input.readData('ir-news-' + str(n) + '.csv')
            for d in data["content"]:
                print(self.docIndex)
                self.docIndex += 1
                d = self.cleanContent(d)
                d = self.wordFormer.normalize(d)
                tokens = self.wordFormer.tokenize(d)
                self.c += len(tokens)
                tokens = list(filter(lambda a: a != '\n', tokens))
                tokens = self.wordFormer.uniform(tokens)
                stemmed_tokens = self.wordFormer.stemmWords(tokens)
                lemmatized_tokens = self.wordFormer.lemmatizeWords(
                    stemmed_tokens)
                # Mask punctuation / stop-words (duplicate '"' removed; the
                # test is membership so the semantics are unchanged).
                lemmatized_tokens = self.Filter(
                    lemmatized_tokens,
                    self.constants.punctuations() +
                    ['\"', '!', '', '\n'] + self.constants.StopWords())
                # Plain loop instead of the original list(map(...)) that was
                # used only for its side effects.
                for pos, tok in enumerate(lemmatized_tokens):
                    self.makeDic(tok, pos)
            print('doc' + str(n) + ': ' + str(self.docIndex))
        for i in range(0, len(self.posting_list)):
            self.input.writeDocID(self.posting_list[i])
            self.input.writePostingList([
                self.stringmaker(self.posting_list[i][key])
                for key in self.posting_list[i].keys()
            ])
        print('number of tokens')
        print(self.c)
        print(time.time())

    def getRelatedDocs(self, token):
        """Return the {doc_id: [positions]} dict for *token*, or {} if the
        token is not in the dictionary.

        Fix: the original used np.where(self.dictionary == token) although
        self.dictionary is a plain dict, so the comparison never matched a
        term; the row is now looked up by dictionary insertion order — the
        same scheme makeDic uses to assign rows.
        """
        if token in self.dictionary:
            return self.posting_list[list(self.dictionary.keys()).index(token)]
        else:
            return {}

    def cleanContent(self, raw):
        """Strip HTML tags and character entities from *raw* text."""
        cleaner = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
        cleanText = re.sub(cleaner, ' ', raw)
        return cleanText

    def stringmaker(self, list):
        """Serialise a list of positions to 'p1 p2 ... ' (the trailing space
        matches the existing file format).

        NOTE: the parameter name shadows the builtin; kept for interface
        compatibility with existing callers.
        """
        return ''.join(str(i) + ' ' for i in list)
Ejemplo n.º 9
0
class Tokenizer:
    """Persian word tokenizer: corrects known multi-word expressions,
    re-attaches suffixes / verb affixes with ZWNJ, and separates alphabetic
    runs from punctuation/symbol runs."""

    def __init__(self):
        # Characters treated as letters of a word (incl. ZWNJ and diacritics).
        self.alphabet = [
            'پ', 'چ', 'ج', 'ح', 'خ', 'ه', 'ع', 'غ', 'ف', 'ق', 'ث', 'ص', 'ض',
            'گ', 'ک', 'م', 'ن', 'ت', 'ا', 'ل', 'ب', 'آ', 'ی', 'س', 'ش', 'و',
            'ئ', 'د', 'ذ', 'ر', 'ز', 'ط', 'ظ', 'أ', 'ژ', '\u200c', 'ُ', 'ّ',
            'ة', 'ۀ', 'ؤ', 'ء', 'إ'
        ]
        self.constants = ConstantVars()
        # Verb clitics that attach after a stem.
        self.after_verbs = {'ام', 'ای', 'ایم', 'اید', 'اند'}
        # Verb prefixes that attach before a stem.
        self.before_verbs = {'می', 'نمی'}
        # Nominal suffixes joined to the preceding token with ZWNJ.
        self.suffixes = {
            'ی', 'ای', 'ها', 'های', 'تر', 'تری', 'ترین', 'گر', 'گری', 'ام',
            'ات', 'اش'
        }
        # (badly spaced form, corrected form) pairs for fixed expressions.
        self.expression = []
        self.expression.extend([
            ('علی ای حال', 'علی\u200cای\u200cحال'),
            ('بنا بر این', 'بنابراین'),
            ('بنابر این', 'بنابراین'),
            ('مع ذالک', 'مع\u200cذالک'),
            ('فی ما بین', 'فی\u200cمابین'),
            ('فی مابین', 'فی\u200cمابین'),
            ('چنان چه', 'چنان\u200cچه'),
            ('در واقع', 'درواقع'),
            ('فی کل حال', 'فی\u200cکل\u200cحال'),
        ])
        self.expression = compile_patterns(self.expression)

    def _symbol_tokens(self, symbols):
        """Collapse repeated characters in a non-alphabetic run, then split
        it into single-character tokens when it is pure punctuation,
        otherwise keep it as one token.

        Fix: the original passed re.UNICODE as re.sub's positional *count*
        argument instead of *flags* (duplicated in both branches of
        word_tokenize); it is now passed as flags=.
        """
        symbols = re.sub(r'(.)\1+', r'\1', symbols, flags=re.UNICODE)
        puncts = self.constants.punctuations() + ['\"', '!']
        if all(ch in puncts for ch in symbols):
            return list(symbols)
        return [symbols]

    def word_tokenize(self, sentence):
        """Tokenize *sentence* into word and punctuation tokens."""
        sentence = self.expression_corrector(sentence)
        splits = sentence.split(' ')
        sentence = ' '.join(self.token_spacing(splits))
        split = []
        for s in sentence.split(' '):
            if s == '':
                continue
            # Partition the chunk into its alphabetic and symbol characters
            # (relative order preserved within each group).
            letters = ''.join(ch for ch in s if ch in self.alphabet)
            symbols = ''.join(ch for ch in s if ch not in self.alphabet)
            if s[0] in self.alphabet:
                # Word first, then its trailing symbols.
                if letters != '':
                    split.append(letters)
                if symbols != '':
                    split.extend(self._symbol_tokens(symbols))
            else:
                # Leading symbols first, then the word.
                if symbols != '':
                    split.extend(self._symbol_tokens(symbols))
                if letters != '':
                    split.append(letters)
        return split

    def token_spacing(self, tokens):
        """Join suffix / verb-affix tokens onto their neighbour with ZWNJ."""
        result = []
        for token in tokens:
            joined = False
            token_pair = ''
            if result:
                token_pair = result[-1] + '\u200c' + token
                if token in self.suffixes:
                    joined = True
                elif result[-1] in self.before_verbs:
                    joined = True
                elif token in self.after_verbs:
                    joined = True
            if joined:
                result.pop()
                result.append(token_pair)
            else:
                result.append(token)
        return result

    def expression_corrector(self, text):
        """Apply each compiled (pattern, replacement) pair to *text*."""
        for pattern, repl in self.expression:
            text = pattern.sub(repl, text)
        return text
Ejemplo n.º 10
0
class QueryProc:
    """Query processing over the on-disk index files (dictionary, docID and
    positional posting files) loaded through FileInOut.

    initializing() handles «...» phrase groups and '!' negation; processQuery
    is an older variant using '"' as the phrase delimiter.
    """

    def __init__(self):
        self.input = FileInOut()
        self.Dic = self.input.readDic()
        self.DocID_file = self.input.readDocID()
        self.posting_file = self.input.readPostingList()
        self.wordFormer = FormWords()
        self.constants = ConstantVars()
        # Accumulators for included / excluded terms and their positions.
        self.relatedDocs = []
        self.notRelatedDocs = []
        self.relatedDocsPos = []
        self.notRelatedDocsPos = []
        self.notRelatedCounts = 0

    def initializing(self, query):
        """Tokenize *query* (phrases delimited by « », '!' negation), gather
        saved postings per term, and return (doc_ids, positions) after
        phrase handling and NOT-merging."""
        print(query)
        query = self.wordFormer.normalize(query)
        print(query)
        query_tokens = self.wordFormer.tokenize(query)
        for token in query_tokens:
            # NOTE(review): list.remove() during iteration skips the element
            # that follows each removed token — verify intended.
            if token in self.constants.punctuations() or token in self.constants.StopWords():
                query_tokens.remove(token)
        query_tokens = self.wordFormer.uniform(query_tokens)
        # postaged_tokens = self.wordFormer.posTagging(query_tokens)
        stemmed_tokens = self.wordFormer.stemmWords(query_tokens)
        lemmatized_tokens = self.wordFormer.lemmatizeWords(stemmed_tokens)
        i = j = 0
        k = 0                # number of «...» phrase groups seen
        not_include = False  # True when the next token is negated by '!'
        order = False        # True while inside a phrase group
        orderTokens = [[] for i in range(5)]
        for token in lemmatized_tokens:
            print(token)
            if token == "«" and order == False:
                print('first')
                k += 1
                order = True
                continue
            if token == "»" and order == True:
                print('second')
                order = False
                continue
            if order:
                orderTokens[k - 1].append(token)
                continue
            if token == "!":
                not_include = True
                self.notRelatedCounts += 1
                continue
            if not_include:
                self.notRelatedDocs.append(self.getRelatedSavedDocs(token))
                self.notRelatedDocsPos.append(self.getRelatedSavedpos(token))
                not_include = False
            print('order')
            print(order)
            if not not_include and not order:
                print('hahahaha')
                self.relatedDocs.append(self.getRelatedSavedDocs(token))
                self.relatedDocsPos.append(self.getRelatedSavedpos(token))

            # related_result, relatedPos = self.merge(self.relatedDocs, i)
        # Union of all included-term doc ids (duplicates collapsed by set()).
        related_result = []
        relatedPos = []
        for res in range(len(self.relatedDocs)):
            related_result = related_result + self.relatedDocs[res]
            relatedPos = relatedPos + self.relatedDocsPos[res]
        related_result = list(set(related_result))
        # NOTE(review): truncating positions to len(related_result) assumes
        # positions line up with the deduplicated ids — confirm.
        relatedPos = relatedPos[:len(related_result)]
        docs = []
        doc_pos = []
        j = 0
        if related_result != []:
            docs.append(related_result)
            doc_pos.append(relatedPos)
            j += 1
        for i in range(0, k):
            phrase_container, phrase_pos = self.phraseContainerDocs(orderTokens[i])
            docs.append(phrase_container)
            doc_pos.append(phrase_pos)
            j += 1
        final_result, final_pos = self.finalMerge(docs, doc_pos, j)

        relateds_and_not_unrelateds, related_position = self.notMerge(final_result, final_pos)
                # i += 1
        return relateds_and_not_unrelateds,related_position

    def merge_common_docs(self, common_list, docList1, docList2, indexList1, indexList2):
        """Merge docList2/indexList2 into docList1/indexList1, combining the
        position lists of the documents named in *common_list* and appending
        the remainder. Returns (merged positions, merged doc ids)."""
        for doc in common_list:
            i1 = docList1.index(doc)
            i2 = docList2.index(doc)
            docList2.pop(i2)
            indexList1[i1] = indexList1[i1] + indexList2.pop(i2)
        indexList1 = indexList1 + indexList2
        docList1 = docList1 + docList2
        return indexList1, docList1

    def similarity_merge(self, docLists, indexLists):
        """Union-merge several doc lists, combining positions of shared docs.

        NOTE(review): docLists is popped while being iterated, so elements
        get skipped; also the loop variable *doc* is itself a list being
        looked up with index(). Confirm whether this method is still used
        (processQueryBySimilarity no longer calls it).
        """
        if len(docLists) == 0:
            return None, None
        docs = docLists.pop(0)
        indexes = list(filter(lambda n: n != [], indexLists.pop(0)))
        if len(docLists) == 0:
            return docs, indexes
        for doc in docLists:
            i = docLists.index(doc)
            doci = docLists.pop(i)
            dociPos = list(filter(lambda n: n != [], indexLists.pop(i)))
            common = list(set(doci) & set(docs))
            indexes, docs = self.merge_common_docs(common, docs, doci, indexes, dociPos)
        return docs, indexes

    def processQueryBySimilarity(self, query):
        """Entry point for similarity-style search; currently delegates all
        work to initializing() (older merge pipeline kept commented out)."""
        print('queryyy')
        print(query)
        docList, indexList = self.initializing(query)
        # related_result, related_pos = self.relatedDocs, self.relatedDocsPos
        # j = 0
        # if related_result != []:
        #     j += 1
        # for i in range(0, k):
        #     phrase_container, phrase_pos = self.phraseContainerDocs(orderTokens[i])
        #     related_result.append(phrase_container)
        #     related_pos.append(phrase_pos)
        #     j += 1
        # relateds_and_not_unrelateds, related_position = self.finalMerge(related_result, related_pos, j)
        # # relateds_and_not_unrelateds, related_position = self.similarity_merge(related_result, related_pos)
        # docList, indexList = self.notMerge(relateds_and_not_unrelateds, related_position)
        return docList, indexList

    def processQuery(self, query):
        """Boolean query variant using '"' as the phrase delimiter; returns
        (doc_ids, positions) after intersecting postings and NOT-merging."""
        query = self.wordFormer.normalize(query)
        query_tokens = self.wordFormer.tokenize(query)
        for token in query_tokens:
            # NOTE(review): same remove-while-iterating pattern as above.
            if token in self.constants.punctuations() or token in self.constants.StopWords():
                query_tokens.remove(token)
        query_tokens = self.wordFormer.uniform(query_tokens)
        # postaged_tokens = self.wordFormer.posTagging(query_tokens)
        stemmed_tokens = self.wordFormer.stemmWords(query_tokens)
        lemmatized_tokens = self.wordFormer.lemmatizeWords(stemmed_tokens)
        i = j = 0
        k = 0
        not_include = False
        order = False
        orderTokens = [[] for i in range(5)]
        for token in lemmatized_tokens:
            if token == "\"" and order == False:
                k += 1
                order = True
                continue
            if token == "\"" and order == True:
                order = False
                continue
            if order:
                orderTokens[k - 1].append(token)
                continue
            if token == "!":
                not_include = True
                self.notRelatedCounts += 1
                continue
            if not_include:
                self.notRelatedDocs.append(self.getRelatedSavedDocs(token))
                self.notRelatedDocsPos.append(self.getRelatedSavedpos(token))
                not_include = False
            else:
                self.relatedDocs.append(self.getRelatedSavedDocs(token))
                self.relatedDocsPos.append(self.getRelatedSavedpos(token))
                i += 1
        # print('related docs')
        # print(self.relatedDocs)
        related_result, relatedPos = self.merge(self.relatedDocs, i)
        docs = []
        doc_pos = []
        j = 0
        if related_result != []:
            docs.append(related_result)
            doc_pos.append(relatedPos)
            j += 1
        for i in range(0, k):
            phrase_container, phrase_pos = self.phraseContainerDocs(orderTokens[i])
            docs.append(phrase_container)
            doc_pos.append(phrase_pos)
            j += 1
        final_result, final_pos = self.finalMerge(docs, doc_pos, j)
        # print("self.notRelatedCounts")
        # print(self.notRelatedCounts)
        # print('no relate')
        # print(self.notRelatedDocs)
        relateds_and_not_unrelateds, related_position = self.notMerge(final_result, final_pos)
        # for i in range(len(related_pos)):
        #     related_pos[i] = related_pos[i]
        # print(relateds_and_not_unrelateds)
        # print(related_position)
        return relateds_and_not_unrelateds, related_position

    def merge(self, docs, leng):
        """Intersect *leng* doc-id lists (sorted), concatenating the position
        lists (from self.relatedDocsPos) of the common documents."""
        answer = []
        postingAns = []
        if leng == 0:
            return [], []
        elif leng == 1:
            return docs[0], self.relatedDocsPos[0]
        else:
            p2 = docs[0]
            postings2 = []
            for j in range(len(p2)):
                postings2.append(self.relatedDocsPos[0][j])
            i = 1
            while i < leng:
                p1 = docs[i]
                postings1 = []
                for j in range(len(p1)):
                    postings1.append(self.relatedDocsPos[i][j])
                i += 1
                # Two-pointer sorted-list intersection via pop-front.
                while p1 != [] and p2 != []:
                    if p1[0] == p2[0]:
                        answer.append(p1[0])
                        postingAns.append(postings1[0] + postings2[0])
                        p1.remove(p1[0])
                        p2.remove(p2[0])
                        postings1.remove(postings1[0])
                        postings2.remove(postings2[0])
                    elif p1[0] < p2[0]:
                        p1.remove(p1[0])
                        postings1.remove(postings1[0])
                    else:
                        p2.remove(p2[0])
                        postings2.remove(postings2[0])
                p2 = answer
                postings2 = postingAns
        print('docc')
        print(answer)
        print(postingAns)
        return answer, postingAns

    def finalMerge(self, docs, docPos, length):
        """Intersect *length* (doc-id list, positions list) pairs, combining
        the positions of the common documents."""
        answer = []
        docPosAns = []
        if length == 0:
            return [], []
        elif length == 1:
            return list(docs[0]), list(docPos[0])
        else:
            p2 = list(docs[0])
            docPos2 = list(docPos[0])
            i = 1
            while i < length:
                p1 = list(docs[i])
                docPos1 = list(docPos[i])
                i += 1
                while p1 != [] and p2 != []:
                    if p1[0] == p2[0]:
                        answer.append(p1[0])
                        docPosAns.append(docPos1[0] + docPos2[0])
                        p1.remove(p1[0])
                        p2.remove(p2[0])
                        docPos1.remove(docPos1[0])
                        docPos2.remove(docPos2[0])
                    elif p1[0] < p2[0]:
                        p1.remove(p1[0])
                        docPos1.remove(docPos1[0])
                    else:
                        p2.remove(p2[0])
                        docPos2.remove(docPos2[0])
                p2 = answer
                docPos2 = docPosAns
        # print('docc and double quote')
        # print(answer)
        # print(docPosAns)
        return answer, docPosAns

    def notMerge(self, relatedDocs, relatedPos):
        """Remove from *relatedDocs* every document appearing in any of the
        excluded-term doc lists collected in self.notRelatedDocs."""
        print('no relate')
        print(self.notRelatedDocs)
        answer = []
        postingAns = []
        if self.notRelatedCounts == 0:
            if len(relatedDocs) != 0:
                return relatedDocs, list(relatedPos)
            else:
                return [], []
        else:
            p1 = relatedDocs
            posting1 = relatedPos
            i = 0
            while i < self.notRelatedCounts:
                p2 = self.notRelatedDocs[i]
                i += 1
                # Keep p1 entries smaller than the current excluded id; drop
                # matches from both lists.
                while p1 != [] and p2 != []:
                    if p1[0] == p2[0]:
                        p1.remove(p1[0])
                        posting1.remove(posting1[0])
                        p2.remove(p2[0])
                    elif p1[0] < p2[0]:
                        answer.append(p1[0])
                        postingAns.append(posting1[0])
                        posting1.remove(posting1[0])
                        p1.remove(p1[0])
                    else:
                        p2.remove(p2[0])
        # Whatever survived all exclusion lists is part of the answer.
        for p in p1:
            answer.append(p)
        for posting in posting1:
            postingAns.append(posting)
        print('finall docc')
        return answer, postingAns

    def phraseContainerDocs(self, pharase):
        """Return (doc_ids, positions) for documents containing the tokens of
        *pharase* at consecutive positions, using the saved postings."""
        # to numbers of pharase length
        docs = []
        docsPos = []
        for p in pharase:
            docs.append(self.getRelatedSavedDocs(p))
            docsPos.append(self.getRelatedSavedpos(p))
        answer = []
        answer_posting = [[] for k in range(50)]
        length = len(docs)
        if length == 0:
            return [], []
        elif length == 1:
            # print(docs[0])
            return docs[0], docsPos[0]
        else:
            p2 = docs[0]
            posting2 = docsPos[0]
            i = 1
            while i < len(pharase):
                # Restart the accumulator for each additional phrase token.
                index = -1
                answer = []
                answer_posting = [[] for k in range(50)]
                p1 = docs[i]
                posting1 = docsPos[i]
                i += 1
                while (p1 != [] and p2 != []):
                    if p1[0] == p2[0]:
                        # A document matches when some position of the prefix
                        # is immediately followed by the next token.
                        for posting in posting2[0]:
                            if (posting + 1) in posting1[0]:
                                if p1[0] not in answer:
                                    answer.append(p1[0])
                                    index += 1
                                    answer_posting[index].append(posting + 1)
                        # print({p1[0] : docs[i - 1][p1[0]]})
                        p1.remove(p1[0])
                        p2.remove(p2[0])
                        posting1.remove(posting1[0])
                        posting2.remove(posting2[0])
                    elif p1[0] < p2[0]:
                        p1.remove(p1[0])
                        posting1.remove(posting1[0])
                    else:
                        p2.remove(p2[0])
                        posting2.remove(posting2[0])
                p2 = answer
                # print('ans')
                # print(answer)
                # print(answer_posting)
                posting2 = answer_posting
        # print('double qoute')
        # print(answer)
        # print(answer_posting)
        return answer, answer_posting

    def getRelatedSavedDocs(self, token):
        """Return the saved doc-id list (ints) for *token*, or [] if the
        token is not in the loaded dictionary."""
        i = 0
        if token in self.Dic:
            # print(self.Dic.index(token))
            posting = list(map(int, self.DocID_file[self.Dic.index(token)]))
            i += 1
            print(posting)
            return posting
        return []

    def getRelatedSavedpos(self, token):
        """Return the saved per-document position lists (ints) for *token*,
        or [] if the token is not in the loaded dictionary."""
        i = 0
        if token in self.Dic:
            # print(self.Dic.index(token))
            posting = [list(map(int, self.posting_file[self.Dic.index(token)][j].split(' '))) for j in
                       range(len(self.posting_file[self.Dic.index(token)]))]
            i += 1
            return posting
        return []