def findFeature(i, j):
    # Scalar similarity features between i and j
    dice_score = dice_recur(i, j)
    cosine_score = get_cosine(i, j)
    jaccard_score = is_ci_token_stopword_set_match(i, j)
    fuzzy_score = get_string_similarity(i, j)
    sequence_matcher = similar(i, j)
    # tf-idf cosine similarity
    tfidf_cosine_score = tfidf_cosine(i, j)
    fmeasure = FMeasure(j)
    word2vec_scores = word2vec_score(i, j)
    # best similarity between words based on WordNet
    wordnet_score = get_wordnet_based_similarity(i, j)

    # Features that already come back as tuples
    rouge_score = calc_Rouge(i, j)
    surf_features = get_surface_features(i, j)
    ner_overlap_score = get_NER_overlap_score(i, j)
    num_overlap_score = get_number_similarity(i, j)
    # number of significant words plus the sum of their significance values
    significant_score = getsignificance(j)

    features = ((dice_score, cosine_score, jaccard_score, fuzzy_score,
                 sequence_matcher, tfidf_cosine_score) + rouge_score +
                (fmeasure,) + surf_features +
                (word2vec_scores, wordnet_score) +
                ner_overlap_score + num_overlap_score + significant_score)

    # Index every feature by its position in the combined tuple
    featureVector = dict(enumerate(features))
    return featureVector
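None of these examples include get_cosine itself; judging from the call sites (usually two collections.Counter term-count vectors, though Example 1 passes the raw texts), it is most likely the standard counter-based cosine similarity. A minimal sketch under that assumption:

import math
from collections import Counter

def get_cosine(vec1, vec2):
    """Cosine similarity of two Counter term-frequency vectors (assumed helper)."""
    shared = set(vec1.keys()) & set(vec2.keys())
    numerator = sum(vec1[term] * vec2[term] for term in shared)
    denominator = (math.sqrt(sum(v * v for v in vec1.values())) *
                   math.sqrt(sum(v * v for v in vec2.values())))
    return float(numerator) / denominator if denominator else 0.0

# e.g. get_cosine(Counter(['hot', 'words']), Counter(['hot', 'topic'])) == 0.5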
Example 2
select_cmd = [None] * len(date)
for i in range(len(date)):
    select_cmd[i] = "select text from query where create_time like '%d%%'" % date[i]

db = sqlite3.connect(db_path)
c = db.cursor()
pr = {}  # assumed container for the per-day PageRank results (not shown in this fragment)

for i in range(len(date)):
    c.execute(select_cmd[i])
    ans = c.fetchall()
    ans = [''.join(x) for x in ans]
    result = [jieba.lcut(each) for each in ans]
    for j in range(len(result)):
        result[j] = [
            word for word in result[j] if word not in [u' '] + stopwords
        ]
    #print get_cosine(Counter(result[0]),Counter(result[1]))
    G = nx.Graph()
    for k in range(len(result)):
        G.add_node(k)
    for x in range(len(result)):
        for y in range(x, len(result)):
            G.add_weighted_edges_from([(x, y,
                                        get_cosine(Counter(result[x]),
                                                   Counter(result[y])))])
    pr[i] = nx.pagerank(G)
    #get_cosine(ans[0],ans[1])
print pr[0]
c.close()
db.commit()
db.close()
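In this fragment pr[i] maps each query's index in that day's result list to its PageRank centrality. A hedged follow-on sketch (not part of the original script) listing the ten most central queries of the first day:

# Rank day 0's queries by PageRank score; the indices refer back to that day's result list.
top = sorted(pr[0].items(), key=lambda kv: kv[1], reverse=True)[:10]
for idx, score in top:
    print idx, score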
Example 3
c.execute(select_cmd)
ans = c.fetchall()
ans = [''.join(x) for x in ans]
print len(set(ans))
result = [jieba.lcut(each) for each in ans]
print len(result)

result = result[:1000]
for j in range(len(result)):
    result[j] = [word for word in result[j] if word not in [u' '] + stopwords]
#print get_cosine(Counter(result[0]),Counter(result[1]))
print 'select finished...'
G = nx.Graph()
print len(result)
for x in range(len(result)):
    for y in range(x, len(result)):
        sim = get_cosine(Counter(result[x]), Counter(result[y]))
        if sim != 0.0:
            # weight the edge by the cosine similarity of the two token lists
            G.add_weighted_edges_from([(x, y, sim)])

print 'graph edges added...'
pr = nx.pagerank(G)
print 'pagerank calculated...'
#get_cosine(ans[0],ans[1])
print pr

print 'Done!'
c.close()
db.commit()
db.close()
Example 4
    def __init__(self, str_q, str_a):

        # Clean the input (delete unnecessary space)
        self.str_q = str_q.strip()
        self.str_a = str_a.strip()

        # Tokenize
        self.bag_q = word_tokenize(self.str_q)
        self.bag_a = word_tokenize(self.str_a)

        # POS Tagging (As feature)
        self.bag_q_pos = nltk.pos_tag(self.bag_q)  # Tuple (word, POS)
        self.bag_a_pos = nltk.pos_tag(self.bag_a)  # Tuple (word, POS)

        # Remove stop words
        self.bag_q_sw_removed = [
            w for w in self.bag_q if w.lower() not in stop_words
        ]
        self.bag_a_sw_removed = [
            w for w in self.bag_a if w.lower() not in stop_words
        ]

        # Stem (As feature)
        self.bag_q_stemmed = []
        self.bag_a_stemmed = []
        for word in self.bag_q:
            self.bag_q_stemmed.append(stemmer.stem(word))
        for word in self.bag_a:
            self.bag_a_stemmed.append(stemmer.stem(word))

        # Lemmatize (As feature)
        self.bag_q_lemmatized = []
        self.bag_a_lemmatized = []
        for word in self.bag_q:
            self.bag_q_lemmatized.append(lemmatizer.lemmatize(word))
        for word in self.bag_a:
            self.bag_a_lemmatized.append(lemmatizer.lemmatize(word))

        # Tree Parse (As feature)
        self.sent_q = sent_tokenize(self.str_q)
        self.sent_a = sent_tokenize(self.str_a)
        self.parse_tree_q = parser.raw_parse_sents(self.sent_q)
        self.parse_tree_a = parser.raw_parse_sents(self.sent_a)

        # WordNet hypernyms, hyponyms, meronyms, and holonyms (as features)
        self.hypernymns = []
        self.hyponyms = []
        self.meronyms = []
        self.holonyms = []
        self.bag_counter = Counter(self.bag_q_sw_removed) + Counter(
            self.bag_a_sw_removed)
        for word in self.bag_counter.keys():
            synsets = wn.synsets(word)
            if synsets:
                max_cos = 0.0
                target_synset = None
                for synset in synsets:
                    definition = synset.definition()
                    # tokenise the synset gloss so both sides of the cosine are word counts
                    cos = get_cosine(Counter(self.bag_q + self.bag_a),
                                     Counter(word_tokenize(definition)))
                    if cos > max_cos:
                        max_cos = cos
                        target_synset = synset
                if target_synset is None:
                    target_synset = synsets[0]
                if target_synset.hypernyms():
                    self.hypernymns += target_synset.hypernyms()
                if target_synset.hyponyms():
                    self.hyponyms += target_synset.hyponyms()
                if target_synset.part_meronyms():
                    self.meronyms += target_synset.part_meronyms()
                if target_synset.part_holonyms():
                    self.holonyms += target_synset.part_holonyms()
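The fragment stops after collecting these pooled relation lists. A self-contained, hedged sketch of one way such lists are commonly turned into a numeric feature (the function and formula are assumptions, not part of the original class):

def synset_overlap(synsets_a, synsets_b):
    # Jaccard-style overlap between two lists of WordNet synsets (assumed helper, not in the original).
    set_a, set_b = set(synsets_a), set(synsets_b)
    union = set_a | set_b
    return float(len(set_a & set_b)) / len(union) if union else 0.0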
Example 5
ans_dict = OrderedDict()
for i in ans:
    ans_dict[i] = ans_dict.get(i, 0) + 1
result = [jieba.lcut(each) for each in ans_dict.keys()]
print len(result)
if not result:
    continue  # the enclosing loop is not part of this fragment
for j in range(len(result)):
    result[j] = [word for word in result[j] if word not in [u' '] + stopwords]
G = nx.Graph()
freqs = ans_dict.values()  # query frequencies, index-aligned with result (Python 2 list)
for x in range(len(result)):
    for y in range(x, len(result)):
        sim = get_cosine(Counter(result[x]), Counter(result[y]))
        if sim != 0.0:
            # weight each edge by both query frequencies times their cosine similarity
            G.add_weighted_edges_from([(x, y, freqs[x] * freqs[y] * sim)])
pr = nx.pagerank(G)
# normalise the PageRank scores to [0, 1]
pr_max = max(pr.values())
for k in pr.keys():
    pr[k] = pr[k] / pr_max
# accumulate a per-day score: each query's frequency times its normalised PageRank
for k in range(len(result)):
    count[d] += freqs[k] * pr.get(k, 0.0)
Example 6
        input_tree = parser.raw_parse_sents(input_sents)

        # WordNet hypernyms, hyponyms, meronyms, and holonyms (as features)
        input_hypernymns = []
        input_hyponyms = []
        input_meronyms = []
        input_holonyms = []
        input_bag_counter = Counter(input_sw_removed)
        for word in input_bag_counter.keys():
            synsets = wn.synsets(word)
            if synsets:
                max_cos = 0.0
                target_synset = None
                for synset in synsets:
                    definition = synset.definition()
                    # tokenise the synset gloss so both sides of the cosine are word counts
                    cos = get_cosine(Counter(input_bag),
                                     Counter(word_tokenize(definition)))
                    if cos > max_cos:
                        max_cos = cos
                        target_synset = synset
                if target_synset is None:
                    target_synset = synsets[0]
                if target_synset.hypernyms():
                    input_hypernymns += target_synset.hypernyms()
                if target_synset.hyponyms():
                    input_hyponyms += target_synset.hyponyms()
                if target_synset.part_meronyms():
                    input_meronyms += target_synset.part_meronyms()
                if target_synset.part_holonyms():
                    input_holonyms += target_synset.part_holonyms()

        # Task 2 - Use a statistical method (cosine similarity) to score the user input against every FAQ
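        # The snippet is cut off here. A hedged sketch of that Task 2 step, assuming
        # faq_bags is a list of pre-tokenised FAQ entries (the name is hypothetical;
        # only input_bag and get_cosine appear in the fragment above):
        faq_scores = []
        for idx, faq_bag in enumerate(faq_bags):
            faq_scores.append((idx, get_cosine(Counter(input_bag), Counter(faq_bag))))
        # highest cosine similarity first
        faq_scores.sort(key=lambda pair: pair[1], reverse=True)
        best_faq_index, best_score = faq_scores[0]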
Example 7
def text_rank(dateJson, source, model, wiki_model):

    data = []

    for item in source:
        if item['create_time'][0:8] == dateJson:
            data.append(item)
    if len(data) == 0:
        return

    print '# of records:' + str(len(data))

    print data[0]['create_time']
    dict_day = collections.OrderedDict()

    Graph = networkx.Graph()

    for item in data:
        for word in item['cut_text']:
            if word not in dict_day:
                dict_day[word] = 1
            else:
                dict_day[word] += 1

    dict_day = sorted(dict_day.iteritems(), key=lambda d: d[1], reverse=True)
    print dict_day[0]

    fre_files = open(path.hotwords_path_fre.format(dateJson), 'w+')
    try:
        for i in range(len(dict_day)):
            line = {'keyword': dict_day[i][0].encode('utf-8'), 'num': str(dict_day[i][1])}
            fre_files.write('{}\n'.format(json.dumps(line, ensure_ascii=False)))
    except IndexError:
        print 'done'
    fre_files.close()

    dict_day = [item for item in dict_day if item[1] > 100]

    print 'adding edges...'

    scale = len(dict_day)
    for x in range(scale):
        for y in range(x, scale):
            if x == y:
                continue
            # Blend three similarity signals: byte-level cosine of the two keyword
            # strings, a domain word2vec model, and a wiki-trained word2vec model.
            sim = 0.1 * cosine.get_cosine(collections.Counter(dict_day[x][0].encode('utf-8')),
                                          collections.Counter(dict_day[y][0].encode('utf-8')))
            if dict_day[x][0] in model and dict_day[y][0] in model:
                sim += 0.2 * model.similarity(dict_day[x][0], dict_day[y][0])
            if dict_day[x][0] in wiki_model and dict_day[y][0] in wiki_model:
                sim += 0.7 * wiki_model.similarity(dict_day[x][0], dict_day[y][0])
            if sim > 0:
                Graph.add_edge(x, y, weight=sim)

    print '# of nodes: ' + str(Graph.number_of_nodes())
    print '# of edges: ' + str(Graph.number_of_edges())
    print 'pageranking...'

    pr = networkx.pagerank(Graph)

    # [keyword, frequency, pagerank score] for every node in the graph
    result = [[dict_day[node][0], dict_day[node][1], score]
              for node, score in pr.items()]

    sum_of_fre = sum([item[1] for item in result])

    result = sorted(result, key=lambda d: d[2], reverse=True)
    pr_file = open(path.hotwords_path_pr.format(dateJson), 'w+')
    try:
        for i in range(len(result)):
            line = {'keyword': result[i][0].encode('utf-8'), 'num': str(result[i][1]),
                    'factor': str(result[i][2])}
            pr_file.write('{}\n'.format(json.dumps(line, ensure_ascii=False)))
    except IndexError:
        print 'done'
    pr_file.close()

    result = sorted(result, key=lambda d: d[2] * d[1], reverse=True)
    hot_words_file = open(path.hotwords_path_frepr.format(dateJson), 'w+')
    try:
        for i in range(len(result)):
            line = {'keyword': result[i][0].encode('utf-8'), 'num': str(result[i][1]),
                    'factor': str(result[i][2])}
            hot_words_file.write('{}\n'.format(json.dumps(line, ensure_ascii=False)))
    except IndexError:
        print 'done'
    hot_words_file.close()
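A hedged driver sketch for text_rank, assuming only what the function itself implies: source is a list of dicts with 'create_time' and 'cut_text' keys, and model / wiki_model expose `word in model` and model.similarity(w1, w2) (for example older-API gensim word2vec models). The file names below are hypothetical:

import json
import gensim

# Hypothetical inputs; only their shapes are taken from text_rank itself.
source = [json.loads(line) for line in open('cut_queries.json')]  # [{'create_time': ..., 'cut_text': [...]}, ...]
model = gensim.models.Word2Vec.load('domain_w2v.model')
wiki_model = gensim.models.Word2Vec.load('wiki_w2v.model')

for day in ['20160101', '20160102']:
    text_rank(day, source, model, wiki_model)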
Example 9
for i in range(len(stopwords)):
    stopwords[i] = stopwords[i].decode('utf8')
jieba.load_userdict("dict/event.txt")

select_cmd = [None] * len(date)
for i in range(len(date)):
    select_cmd[i] = "select text from query where create_time like '%d%%'" % date[i]

db = sqlite3.connect(db_path)
c = db.cursor()
pr = {}  # assumed container for the per-day PageRank results (not shown in this fragment)

for i in range(len(date)):
    c.execute(select_cmd[i])
    ans = c.fetchall()
    ans = [''.join(x) for x in ans]
    result = [jieba.lcut(each) for each in ans]
    for j in range(len(result)):
        result[j] = [word for word in result[j] if word not in [u' '] + stopwords]
    G = nx.Graph()
    for k in range(len(result)):
        G.add_node(k)
    for x in range(len(result)):
        for y in range(x, len(result)):
            # weight each edge by the cosine similarity of the two token lists
            G.add_weighted_edges_from([(x, y,
                                        get_cosine(Counter(result[x]),
                                                   Counter(result[y])))])
    pr[i] = nx.pagerank(G)
print pr[0]
c.close()
db.commit()
db.close()