def proximity_search(k, terms):
    """Positional proximity search over the global `index`: return postings for
    documents where the two (singularized) terms occur close together, plus
    snippet excerpts spanning each match."""
    k += 1
    result = []
    proximities = []
    t1 = singularize(terms[0])
    t2 = singularize(terms[1])
    l1 = index[t1]
    l2 = index[t2]
    ptr1 = 0
    ptr2 = 0

    while ptr1 < len(l1) and ptr2 < len(l2):
        doc1 = l1[ptr1]
        doc2 = l2[ptr2]
        if (doc1["doc_id"] == doc2["doc_id"]):
            l = []
            pos_ptr1 = 0
            pos_ptr2 = 0
            pos_pairs = []
            while pos_ptr1 < len(doc1["positions"]):
                while pos_ptr2 < len(doc2["positions"]):
                    if abs(doc1["positions"][pos_ptr1] -
                           doc2["positions"][pos_ptr2]) <= k:
                        l.append(doc2["positions"][pos_ptr2])
                    elif doc2["positions"][pos_ptr2] > doc1["positions"][
                            pos_ptr1]:
                        break

                    pos_ptr2 += 1
                while l and abs(l[0] - doc1["positions"][pos_ptr1]) > k:
                    l.remove(l[0])
                for position in l:
                    pos_pairs.append([doc1["positions"][pos_ptr1], position])
                pos_ptr1 += 1

            if pos_pairs:
                result.append({
                    "doc_id": doc1["doc_id"],
                    "doc_name": doc_index[str(doc1["doc_id"])],
                    "doc_snippet": doc1["doc_snippet"],
                    "positions": doc1["positions"]
                })
                t1_pos = doc1["doc_snippet"].find(t1)
                t2_pos = doc1["doc_snippet"].find(t2)
                proximities.append(doc1["doc_snippet"][t1_pos:t2_pos +
                                                       len(t2)])

            ptr1 += 1
            ptr2 += 1
        elif doc1["doc_id"] < doc2["doc_id"]:
            ptr1 += 1
        else:
            ptr2 += 1

    return result, proximities
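
The nested while-loops above follow the classic positional-intersect pattern. As a reference, here is a minimal self-contained sketch of the same two-pointer idea over plain sorted position lists (positions_within_k is a hypothetical helper, not part of the example above):

def positions_within_k(pos1, pos2, k):
    """Return (p1, p2) pairs with |p1 - p2| <= k, assuming sorted position lists."""
    pairs, window, j = [], [], 0
    for p1 in pos1:
        # pull in positions of the second term that are not too far ahead
        while j < len(pos2) and pos2[j] <= p1 + k:
            if abs(pos2[j] - p1) <= k:
                window.append(pos2[j])
            j += 1
        # drop positions that have fallen too far behind the current p1
        window = [p2 for p2 in window if abs(p2 - p1) <= k]
        pairs.extend((p1, p2) for p2 in window)
    return pairs

# positions_within_k([4, 10], [6, 30], 2)  ->  [(4, 6)]
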
Example #2
def index_doc(word, doc_id, i, snippet):
    """Record one occurrence of `word` at position `i` of document `doc_id`
    in the global positional index (index, simple_index, word_doc_ptr)."""
    if word == '*':
        index['*'].append({"doc_id": doc_id})
    else:
        word = ''.join(filter(lambda x: x in printable, word))
        if (word not in stopwords):
            word = singularize(word)
            if (word and word in simple_index):
                if (doc_id not in simple_index[word]):
                    word_doc_ptr[word] += 1
                    index[word].append({
                        "doc_id": doc_id,
                        "doc_snippet": snippet,
                        "positions": [i],
                    })
                    simple_index[word].append(doc_id)
                else:
                    doc = index[word][word_doc_ptr[word]]
                    doc["positions"].append(i)
            else:
                simple_index[word] = [doc_id]
                index[word] = [{
                    "doc_id": doc_id,
                    "doc_snippet": snippet,
                    "positions": [i]
                }]
                word_doc_ptr[word] = 0
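
For reference, the module-level globals this function maintains end up shaped roughly as follows (a hypothetical illustration; the names mirror the example, the values are invented):

index = {
    "cat": [{"doc_id": 3, "doc_snippet": "the cat sat ...", "positions": [1, 9]}],
    "*":   [{"doc_id": 3}],
}
simple_index = {"cat": [3]}   # doc_ids already seen for each word
word_doc_ptr = {"cat": 0}     # index of the newest posting in index[word]
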
Example #3
def data_split(datas):  # process the parsed rows into dictionaries
    result = {}
    id = {}
    name = {}
    for i in range(len(datas)):

        data = datas[i]
        recipe = {}
        # for data in datas:
        dst = {}
        # print(len(data))
        for index in range(2, len(data)):
            # print(index)
            dec = data[index].split('#')
            temp = dec[0]
            temp = re.sub('fresh|frozen|large|small|chunks', '', temp)  # strip a few unimportant adjectives
            temp = singularize(temp)
            if temp not in materials:
                materials.append(temp)
            dst[temp] = dec[1]
        # print(dst)
        result[i] = dst
        name[i] = data[1]
        id[data[1]] = data[0]
        # print(result)

    return result, name, id
Example #4
def emailWordsBagBayesian(words):
    emailWordsBagBayes = 0
    wordsCount = 0
    for word in words:
        if len(word) < 2: continue
        word = WordNetLemmatizer().lemmatize(word, 'v')
        word = singularize(word)
        emailWordsBagBayes += wordsBagBayesian(word)
        wordsCount += 1
    return emailWordsBagBayes / wordsCount if wordsCount != 0 else 0
Example #5
def updateWordsValue(emails, status):
    for email in emails:
        for word in email.split():
            if len(word) < 2: continue
            word = WordNetLemmatizer().lemmatize(word, 'v')
            word = singularize(word)
            if word not in wordsValue:
                wordsValue[word] = 0
            delta = 1 / hamsNumber if status == 'ham' else -1 / spamsNumber
            wordsValue[word] += delta
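
The scoring scheme is easiest to see on a toy corpus; the sketch below restates the same +1/hamsNumber, -1/spamsNumber update as a standalone function (hypothetical name, and it skips the lemmatization and length filtering for brevity):

def toy_words_value(hams, spams):
    """Each occurrence in a ham email adds 1/len(hams); each in a spam email subtracts 1/len(spams)."""
    value = {}
    for email in hams:
        for w in email.split():
            value[w] = value.get(w, 0) + 1 / len(hams)
    for email in spams:
        for w in email.split():
            value[w] = value.get(w, 0) - 1 / len(spams)
    return value

# toy_words_value(["free lunch", "team lunch"], ["free money"])
# -> {'free': -0.5, 'lunch': 1.0, 'team': 0.5, 'money': -1.0}
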
Example #6
def updateWordsRepeat(emails, wordsRepeat, status):
    global wordsNumberOfHams, wordsNumberOfSpams
    for email in emails:
        for word in email.split():
            if len(word) < 2: continue
            word = WordNetLemmatizer().lemmatize(word, 'v')
            word = singularize(word)
            if word not in wordsRepeat:
                wordsRepeat[word] = 0
            wordsRepeat[word] += 1
            if status == 'ham': wordsNumberOfHams += 1
            else: wordsNumberOfSpams += 1
Example #7
def data_split(datas):  # process the parsed rows into dictionaries
    result = {}
    id = {}
    name = {}
    t = {}
    for i in range(len(datas)):

        data = datas[i]
        recipe = {}
        # for data in datas:
        dst = {}
        # print(len(data))
        ingredient = {}
        for index in range(2, len(data)):
            # print(index)
            dec = data[index].split('#')
            a = dec[0]
            b = a
            b = singularize(b)
            temp = dec[0].lower()
            temp = re.sub('fresh|frozen|large|small|chunks', '',
                          temp)  # strip a few unimportant adjectives

            temp = singularize(temp)
            if temp not in materials:
                materials.append(temp)
            dst[temp] = dec[1]
            ingredient[dec[0]] = temp
            if a != b:
                t[a] = b
                t[b] = a
        # print(dst)
        result[data[0]] = dst
        name[data[0]] = ingredient
        id[data[0]] = data[1]
        # print(result)

    return result, name, id, t
Example #8
def setEmailsValue(emails):
    values = []
    for email in emails:
        value = 0
        wordsProcessNumber = 0
        for word in email.split():
            word = WordNetLemmatizer().lemmatize(word, 'v')
            word = singularize(word)
            if word not in wordsValue: continue
            value += wordsValue[word]
            wordsProcessNumber += 1
        value /= wordsProcessNumber if wordsProcessNumber != 0 else 1
        values.append(value)
    return values
Example #9
def data_split(datas):  # process the parsed rows into dictionaries
    result = []
    for data in datas:
        dst = {}
        # print(len(data))
        for index in range(2, len(data)):
            # print(index)
            dec = data[index].split('#')
            temp = dec[0]
            temp = re.sub('fresh|frozen|large|small|chunks', '', temp)  # strip a few unimportant adjectives
            temp = singularize(temp)
            if temp not in materials:
                materials.append(temp)
            dst[temp] = dec[1]
        # print(dst)
        result.append(dst)
        # print(result)
    return result
Example #10
def query(query_str):
    query_str = query_str.split(" ")
    q = np.zeros((len(vocab)))
    for term in query_str:
        term = ''.join(filter(lambda x: x in printable, term))
        if (term not in stopwords):
            term = singularize(term)
            term = re.sub(r'ly$', r'', term)
            term = re.sub(r'ed$', r'', term)
            term = re.sub(r'ing$', r'', term)
            term = re.sub(r'nes$', r'', term)
            if len(term) >= 3:
                term_index = vocab.index(term)
                q[term_index] = idf[term]

    q /= np.linalg.norm(q)

    alpha = 0.001
    S = np.dot(D, q)
    idx = np.arange(S.size)[S >= alpha]
    res = list(map(int, sorted(idx[np.argsort(S[idx])] + 1)))

    return res
doc_tf = {}
idf = {}

for i in range(1, 51):
    with open(os.path.join(DATASET_DIR, str(i) + '.txt')) as file:
        words = file.read()
        words = re.sub(r'\n|--', r' ', words)
        words = re.sub(r'“|”|’|‘|;|,|!|:|\.|\?|\)|\(|\*', r'', words)
        words = words.lower()
        words = re.split(r" |-|\u2014", words)
        words = [word for word in words if word]

        for word in words:
            word = ''.join(filter(lambda x: x in printable, word))
            if (word not in stopwords):
                word = singularize(word)
                word = re.sub(r'ly$', r'', word)
                word = re.sub(r'ed$', r'', word)
                word = re.sub(r'ing$', r'', word)
                word = re.sub(r'nes$', r'', word)
                if len(word) >= 3:
                    if (i, word) not in doc_tf:
                        doc_tf[i, word] = 1

                        if word not in idf:
                            idf[word] = 1
                        else:
                            idf[word] += 1
                    else:
                        doc_tf[i, word] += 1
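
At this point doc_tf holds raw term counts per (document, word) pair and idf holds raw document frequencies, while query() above expects a vocab list, idf weights and an L2-normalised document matrix D, none of which are shown. A possible completion under the common log(N/df) weighting (an assumption, not taken from the source):

import numpy as np

# Hypothetical glue code: turn the raw counts above into the vocab / idf / D
# objects that query() scores against (log(N/df) weighting is assumed).
N = 50
vocab = sorted({word for (_, word) in doc_tf})
idf = {word: np.log(N / df) for word, df in idf.items()}

D = np.zeros((N, len(vocab)))
for (doc, word), tf in doc_tf.items():
    D[doc - 1, vocab.index(word)] = tf * idf[word]
norms = np.linalg.norm(D, axis=1, keepdims=True)
norms[norms == 0] = 1.0        # avoid dividing empty rows by zero
D /= norms                     # L2-normalise each document vector
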
Example #12
def clean_job_resp_col(extract_job_data):
    stop_words = set(stopwords.words('english'))
    remove_punc = str.maketrans('', '', string.punctuation)
    extract_job_data['JobDescription'] = extract_job_data['JobDescription']\
        .apply(lambda x: ' '.join([singularize(word) for word in str(x).translate(remove_punc).split() if word.lower() not in (stop_words)]))
    print('done')
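
A hypothetical usage sketch, assuming pandas is imported and NLTK's stopwords plus pattern's singularize are available in the surrounding module, as the function expects:

import pandas as pd

jobs = pd.DataFrame({"JobDescription": ["Manages teams, writes weekly reports!"]})
clean_job_resp_col(jobs)
# The column is now punctuation-free, stopword-free and singularized,
# e.g. roughly "Manage team write weekly report" (exact output depends on pattern's rules).
print(jobs["JobDescription"].iloc[0])
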
Example #13
    def __call__(self, document):
        if self.delete_punctuation_marks:
            c = re.compile('[{}]'.format(re.escape(string.punctuation)))
            document = c.sub('', document)

        if self.delete_numeral:
            c = re.compile('[{}]'.format(re.escape(string.digits)))
            document = c.sub('', document)

        document = document.lower()
        texts = document.split()

        if self.stop_words is not None:
            texts = [word for word in texts if word not in self.stop_words]

        words = []
        load.polyglot_path = os.path.join(os.getcwd(), 'polyglot_data')
        os.path.sep = '/'
        n = ['NOUN', 'PRON', 'PROPN']
        a = ['ADJ', 'ADP']
        v = ['VERB', 'ADV', 'AUX']
        #     initial_texts = []
        #     types = []
        #     for token in texts:
        #         pos_tag = Text(token, 'en').pos_tags[0][1]
        #         if pos_tag in n:
        #             initial_texts.append(en.singularize(token))
        #         elif pos_tag in a:
        #             initial_texts.append(WordNetLemmatizer().lemmatize(token, 'a'))
        #         elif pos_tag in v:
        #             initial_texts.append(en.lemma(token))
        #         else:
        #             initial_texts.append(token)
        #         types.append(pos_tag)
        #
        #     texts = initial_texts
        #
        # # if self.delete_single_words:
        # #     # self.set_words = set(texts).difference(self.set_words)
        # #     texts = list(set(texts).difference(self.set_words))
        # #     self.set_words.update(set(texts))
        #
        # return texts, types

        for token in texts:
            pos_tag = Text(token, 'en').pos_tags[0][1]
            if pos_tag in n:
                word = en.singularize(token)
            elif pos_tag in a:
                word = WordNetLemmatizer().lemmatize(token, 'a')
            elif pos_tag in v:
                word = en.lemma(token)
            else:
                word = token
            # print(token + " -> " + word)
            # if word == "ymy":  # TODO remove and fix 'ymy' bug
            #     word = 'your'
            word = (word, pos_tag)
            words.append(word)

        unique_words = list(set(words))
        occurrences = sorted([(x[0], x[1], words.count(x)) for x in unique_words], key=lambda y: y[2], reverse=True)
        if self.initial_form:
            return [w[0] for w in words], occurrences
        else:
            return texts, occurrences
def ans_data():
    print("ans_data")
    file = open('test.txt', 'r', encoding="UTF-8-sig")
    js = file.read()
    similar_test = json.loads(js)
    print(len(similar_test.keys()))
    file.close()
    data_test, name_test, id_test, t = test_data_split(get_data_test())
    data_train, name_train, id_train = train_data_split(train_data_read())
    data_train, material_index, material_sum, material_evg, material_th, material_count = get_material_information(
        data_train)
    material_sum, material_error = clean_data(data_train, material_index,
                                              material_sum, material_evg,
                                              material_th, material_count)
    similar_dec = {}

    for key in similar_test:
        temp = []
        for i in range(len(similar_test[key])):
            if (similar_test[key][i][1] >= 0.9
                    and similar_test[key][i][0] not in material_error):
                temp.append(similar_test[key][i][0])
        similar_dec[key] = temp
    ans = []
    for key in data_test:
        recipe = str(key) + "," + str(id_test[key])
        for ingredient in data_test[key]:
            temp = []
            flag = False
            for i in range(len(similar_dec[key])):
                if ingredient in data_train[similar_dec[key][i]].keys():
                    if data_train[similar_dec[key][i]][ingredient] != '':
                        flag = True
                        temp.append(
                            int(data_train[similar_dec[key][i]][ingredient]))

            if flag and len(temp) != 0:

                data_test[key][ingredient] = np.mean(temp)
            # elif ingredient in material_evg.keys():
            #     data_test[key][ingredient] = material_evg[ingredient]
            else:
                try:
                    data_test[key][ingredient] = material_evg[ingredient]
                except KeyError:
                    data_test[key][ingredient] = "Null"
            # recipe += ',' + str(name_test[key][ingredient]) + "#" + str(data_test[key][ingredient])
            recipe += ',' + str(ingredient) + "#" + str(
                data_test[key][ingredient])

        ans.append(recipe)

    file = open('./data/recipe2.csv', 'w', encoding='utf-8')
    for i in range(len(ans)):
        s = ans[i]
        s = s + '\n'  # append a newline to each output line
        file.write(s)
    file.close()
    print("保存文件成功")
    result = []
    test_data = read(test_data_path)
    ans = readans('./data/recipe2.csv')
    for key in test_data:
        recipe = str(key)
        # print(key)
        for i in range(len(test_data[key])):
            # print("????????????????????????????????")
            print(key, test_data[key][i][0])

            ingredient = test_data[key][i][0].lower()
            temp = re.sub('fresh|frozen|large|small|chunks', '', ingredient)
            temp = singularize(temp)
            print(temp)
            # print(key, ans[key][temp])
            try:
                recipe += "," + str(test_data[key][i][0]) + "#" + str(
                    ans[key][temp])
            except KeyError:
                try:
                    # recipe += "," + str(test_data[key][i][0]) + "#" + str(ans[key][name_test[key.split(",")[0]][ingredient]])
                    recipe += "," + str(test_data[key][i][0]) + "#" + str(
                        ans[key][ingredient])
                except KeyError:
                    recipe += "," + str(test_data[key][i][0]) + "#" + str(
                        ans[key][ingredient + "®"])
        result.append(recipe)
    file = open('./data/recipe2.csv', 'w', encoding='utf-8')
    for i in range(len(result)):
        s = result[i]
        s = s + '\n'  # append a newline to each output line
        file.write(s)
    file.close()
    print("保存文件成功")
def search(term):
    term = singularize(term)
    if term in index:
        return index[term], [term]
    else:
        return [], ""