Ejemplos de cleaning en Python, ejemplos de general.cleaning en Python

Ejemplo n.º 1

0

Mostrar archivo

Archivo: ranking_nlp.py Proyecto: rahulchawla1803/customized-WSE-display

def _score_ngram_matches(docs, query_grams, field, weight, scores, matched):
    """Add ``weight`` to a document's score for every fuzzy n-gram match.

    Every n-gram in ``query_grams`` is compared with every entry of
    ``doc[field]``; a similarity of at least 0.5 (``NGram.compare``) counts
    as a match.  ``scores`` (URL -> score) and ``matched`` (set of matching
    document n-grams) are updated in place.
    """
    for doc in docs:
        for query_gram in query_grams:
            for doc_gram in doc[field]:
                if NGram.compare(query_gram, doc_gram) >= 0.5:
                    scores[doc['url']] += weight
                    matched.add(doc_gram)


def rank_ngram(query):
    """Score documents by fuzzy bigram/trigram overlap with the query.

    The query is tokenised with ``cleaning`` and turned into word bigrams
    and trigrams.  These are compared against the ``gram2`` / ``gram3``
    entries of every record in ``ngram.json`` (one JSON object per line).
    A bigram match adds 1 to the score of the record's URL, a trigram match
    adds 3.

    Args:
        query: Raw query text; passed unchanged to ``cleaning``.

    Returns:
        A tuple ``(scores, matched)``.  ``scores`` maps each URL with a
        positive score to that score; ``matched`` is a de-duplicated list
        (in no particular order) of the document n-grams that matched.
        A query that cleans down to exactly one word returns ``({}, {})``.
    """
    clean_query = cleaning(query)
    length_query = len(clean_query)
    # Compare by value.  ``is`` on integers only happens to work for the
    # small ints CPython caches, so long queries used to skip the loop
    # break conditions and run off the end of the word list.
    if length_query == 1:
        # Kept as two dicts for backward compatibility with callers.
        return {}, {}

    # The context manager guarantees the file handle is closed.
    with open('ngram.json', 'r') as ngram_file:
        docs = [json.loads(line) for line in ngram_file]

    rank_sum_dict = {doc['url']: 0 for doc in docs}

    # Sliding windows of two and three consecutive words.
    query_gram2 = [
        clean_query[i] + ' ' + clean_query[i + 1]
        for i in range(length_query - 1)
    ]
    query_gram3 = [
        clean_query[i] + ' ' + clean_query[i + 1] + ' ' + clean_query[i + 2]
        for i in range(length_query - 2)
    ]

    matched = set()
    _score_ngram_matches(docs, query_gram2, 'gram2', 1, rank_sum_dict, matched)
    _score_ngram_matches(docs, query_gram3, 'gram3', 3, rank_sum_dict, matched)

    # Only documents that matched at least once are reported.
    rank_sum_dict_unsorted = {
        url: score for url, score in rank_sum_dict.items() if score > 0
    }

    return rank_sum_dict_unsorted, list(matched)

Ejemplo n.º 2

0

Mostrar archivo

Archivo: query_module.py Proyecto: rahulchawla1803/customized-WSE-display

def query_structure(search_query):
    """Expand a search query into stemmed words, synonyms and suggestions.

    The query is tokenised with ``cleaning``.  Synonyms come from
    ``synonym1500.json`` (one JSON object per line, each with a ``words``
    list): whenever a query word appears in a record's ``words``, the whole
    list is added.  Spelling suggestions come from
    ``dictionary_spelling.suggest``.  All three word lists are then reduced
    to their roots with ``stem``.

    Args:
        search_query: Raw query text; passed unchanged to ``cleaning``.

    Returns:
        A tuple ``(clean_query_root, query_synonym_root,
        query_suggestion_root)`` of lists of stemmed words.  Duplicates are
        not removed.
    """
    # The context manager guarantees the file handle is closed.
    with open('synonym1500.json', 'r') as synonym_file:
        synonym_docs = [json.loads(line) for line in synonym_file]

    clean_query = cleaning(search_query)

    query_synonym = []
    for word in clean_query:
        for doc in synonym_docs:
            if word in doc['words']:
                # extend() avoids re-copying the accumulated list each time.
                query_synonym.extend(doc['words'])

    query_suggestion = []
    for word in clean_query:
        query_suggestion.extend(dictionary_spelling.suggest(word))

    clean_query_root = [stem(word) for word in clean_query]
    query_synonym_root = [stem(word) for word in query_synonym]
    query_suggestion_root = [stem(word) for word in query_suggestion]

    return clean_query_root, query_synonym_root, query_suggestion_root

Ejemplo n.º 3

0

Mostrar archivo

Archivo: test_search_ranking_title_IR2.py Proyecto: rahulchawla1803/customized-web-search-engine

def search(search_query):
    """Rank indexed page titles against a query and return URL -> title.

    The query is tokenised with ``cleaning`` and expanded with synonyms
    (records of ``synonym.json`` whose ``words`` contain a query word) and
    spelling suggestions (``dictionary_spelling.suggest``).  All words are
    stemmed with ``stem``.  For every document in
    ``webSE.indexed_repo_title`` each title root that equals a query root
    adds the matching ``idf_word_list`` entry to the document's score,
    weighted 1 for query words, 0.75 for synonyms and 0.5 for suggestions.

    The query, the collected synonyms and the final result are printed.

    Args:
        search_query: Raw query text; passed unchanged to ``cleaning``.

    Returns:
        A dict mapping URL to title for documents with a positive score,
        inserted in order of descending score.
    """
    # The context manager guarantees the file handle is closed.
    with open('synonym.json', 'r') as synonym_file:
        synonym_docs = [json.loads(line) for line in synonym_file]

    print(search_query)
    clean_query = cleaning(search_query)

    query_synonym = []
    for word in clean_query:
        for doc in synonym_docs:
            if word in doc['words']:
                query_synonym.extend(doc['words'])

    print(query_synonym)

    query_suggestion = []
    for word in clean_query:
        query_suggestion.extend(dictionary_spelling.suggest(word))

    # (weight, stemmed words) for each source of query terms.
    weighted_roots = (
        (1, [stem(word) for word in clean_query]),
        (0.75, [stem(word) for word in query_synonym]),
        (0.5, [stem(word) for word in query_suggestion]),
    )

    client = MongoClient()
    db = client.webSE

    title_rank = {}
    for doc in db.indexed_repo_title.find({}):
        title_match_value = 0
        for index, title_word_root in enumerate(doc['title_root']):
            for weight, roots in weighted_roots:
                for root in roots:
                    if title_word_root == root:
                        # A root listed several times counts several times.
                        title_match_value += (
                            weight * doc['idf_word_list'][index])
        title_rank[doc['url']] = title_match_value

    # URLs with a positive score, best first.
    title_sorted_rank_list = [
        url
        for url, value in sorted(title_rank.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)
        if value > 0
    ]

    docs = db.indexed_repo_title.find({'url': {'$in': title_sorted_rank_list}})
    doc_dict = {doc['url']: doc['title'] for doc in docs}

    # Re-impose the ranking order with a direct lookup rather than scanning
    # every fetched document for every ranked URL.
    link_title_sorted_dict = {
        link: doc_dict[link]
        for link in title_sorted_rank_list
        if link in doc_dict
    }
    print(link_title_sorted_dict)
    return link_title_sorted_dict


#search("arrides")

Ejemplo n.º 4

0

Mostrar archivo

def _rank_by_field(docs, roots_field, weights_field, query_roots):
    """Return URL -> summed weights of a document's roots found in the query.

    For each document, every entry of ``doc[roots_field]`` that equals a
    query root adds the entry at the same index of ``doc[weights_field]``.
    A query root listed several times counts several times.
    """
    ranks = {}
    for doc in docs:
        match_value = 0
        for index, root in enumerate(doc[roots_field]):
            for query_root in query_roots:
                if root == query_root:
                    match_value += doc[weights_field][index]
        ranks[doc['url']] = match_value
    return ranks


def search(search_query):
    """Rank documents by title and keyword matches; return URL -> title.

    The query is tokenised with ``cleaning``, extended with spelling
    suggestions for every word and with synonyms for every word the spell
    checker accepts, and all words are stemmed with ``stem``.  Documents of
    ``webSE.keyword`` are scored twice: ``title_root`` against
    ``title_relative`` and ``keyword_root`` against ``keyword_relative``.
    The two scores are summed per URL and titles are read from
    ``webSE.data``.

    The query, the expanded query and the title scores are printed.

    Args:
        search_query: Raw query text; passed unchanged to ``cleaning``.

    Returns:
        A dict mapping URL to title for documents with a positive combined
        score, inserted in order of descending score.
    """
    # Named ``spell_checker`` so the builtin ``dict`` is not shadowed.
    # NOTE(review): the "en_UK" tag is kept as-is; confirm it is available
    # in the installed enchant dictionaries.
    spell_checker = enchant.Dict("en_UK")
    dictionary = PyDictionary()
    print(search_query)
    clean_query = cleaning(search_query)

    synonyms_final = []
    for word in clean_query:
        if spell_checker.check(word):
            # Presumably synonym() can return None when it finds nothing
            # (TODO confirm); treat any empty result as "no synonyms"
            # instead of failing on the concatenation.
            synonyms_final.extend(dictionary.synonym(word) or [])

    # Original words first, then the suggestions for each of them.
    expanded_query = list(clean_query)
    for word in clean_query:
        expanded_query.extend(spell_checker.suggest(word))
    print(expanded_query)

    clean_query_root = [stem(word) for word in expanded_query]
    clean_query_root += [stem(word) for word in synonyms_final]

    client = MongoClient()
    db = client.webSE

    title_rank = _rank_by_field(
        db.keyword.find({}), 'title_root', 'title_relative', clean_query_root)
    print(title_rank)

    keyword_rank = _rank_by_field(
        db.keyword.find({}), 'keyword_root', 'keyword_relative',
        clean_query_root)

    combined_rank_dict = {
        url: keyword_rank[url] + value for url, value in title_rank.items()
    }

    # URLs with a positive combined score, best first.
    combined_rank_sorted = [
        url
        for url, value in sorted(combined_rank_dict.items(),
                                 key=operator.itemgetter(1),
                                 reverse=True)
        if value > 0
    ]

    docs = db.data.find({'url': {'$in': combined_rank_sorted}})
    doc_dict = {doc['url']: doc['title'] for doc in docs}

    # Re-impose the ranking order with a direct lookup rather than scanning
    # every fetched document for every ranked URL.
    final_link_title_dict = {
        link: doc_dict[link]
        for link in combined_rank_sorted
        if link in doc_dict
    }

    return final_link_title_dict


#search("")

Ejemplo n.º 5

0

Mostrar archivo

import nltk
from pymongo import MongoClient
from general import cleaning

client = MongoClient()
db = client.webSE
docs = db.data1500.find({})

# Print every token tagged 'NNP' or 'NNPS' (the Penn Treebank proper-noun
# tags) in a five-document sample of the collection.
for doc in docs[570:575]:
    content = doc['content']
    list_content = cleaning(content)
    str_content = ' '.join(list_content)
    text = nltk.word_tokenize(str_content)
    x = nltk.pos_tag(text)

    for i, j in x:
        # Compare tags by value: ``is`` tests object identity, and tag
        # strings produced at runtime are not guaranteed to be the same
        # objects as these literals.
        if j in ('NNP', 'NNPS'):
            print(i, j)

Ejemplo n.º 6

0

Mostrar archivo

from pymongo import MongoClient
from general import cleaning
from stemming.porter2 import stem

client = MongoClient()
db = client.webSE

docs_count = db.data1500.find({}).count()
docs = db.data1500.find({})

# Stemmed title words of all documents; each document contributes every
# distinct root once, so a root shared by several titles appears repeatedly.
title_combined_unique_root = []

for doc in docs:
    title = doc['title']
    title_clean = cleaning(title)
    title_clean_root = [stem(word) for word in title_clean]

    # De-duplicate within this title only.
    title_clean_unique_root = list(set(title_clean_root))
    title_combined_unique_root.extend(title_clean_unique_root)

# Fresh cursor for the next pass over the collection.
docs = db.data1500.find({})

for doc in docs:

Ejemplo n.º 7

0

Mostrar archivo

from ngram import NGram
from pymongo import MongoClient
from general import cleaning
from collections import Counter
from stemming.porter2 import stem

client = MongoClient()
db = client.webSE
docs = db.data1500.find({})

for doc in docs:
    content = doc['content']
    content_clean = cleaning(content)
    # Root form of every cleaned content word, duplicates included.
    content_clean_root = [stem(word) for word in content_clean]

    # The five most frequent roots of this document, most frequent first.
    object_top1 = Counter(content_clean_root)
    top_root_object = object_top1.most_common(5)
    top_root_words = [root for root, _count in top_root_object]

    gram2, gram3 = [], []
    len_content_clean = len(content_clean)

Ejemplo n.º 8

0

Mostrar archivo

from pymongo import MongoClient
from collections import Counter
from general import cleaning
from stemming.porter2 import stem

client = MongoClient()
db = client.webSE

docs = db.data.find({})

# Cleaned title words of every document, duplicates included.
title_combined = []

for doc in docs:
    title = doc['title']
    title_clean = cleaning(title)
    title_combined.extend(title_clean)

# Total number of title words across the collection.
total_len_title = len(title_combined)

# Second pass over the collection with a fresh cursor.
docs = db.data.find({})
for i in docs:
    # Raw text fields of the current document.
    content = i['content']
    title = i['title']

    # Per-document accumulators, reset on every iteration.
    # NOTE(review): the code that fills them is not visible in this excerpt.
    title_clean = []
    content_clean = []
    keyword_relative = []
    title_relative = []