def main(check):
    # Get data
    data = readJsonData('./dataset/politifact_results.json')
    dataset = []

    # Set settings
    checking = 'politifact'
    k = 13
    random_state = 1234
    width = 3

    # Split for Cross Validation
    # x_train, x_test = train_test_split(
    #     data, test_size=0.2, random_state=random_state)  # test = 20%, train = 80%

    for i in data.index:
        if i % 250 == 0:
            print(str(i))
        # Set query and targets
        query = ' '.join(preprocessing(data['original_article.content'][i]))
        query_url = data['original_article.url'][i]
        targets = []
        labels = []
        for v in data['extracted_articles'][i]:
            # Remove incomplete entries
            if check == 1:
                # Skip entries with missing content, title, or URL
                if not v or not v['content'] or not v['title'] or not v['url']:
                    continue

                # Preprocessing
                preprocessed_content = preprocessing(v['content'])
                string_preprocessed_content = ' '.join(preprocessed_content)
                targets.append(string_preprocessed_content)
                labels.append(v['url'])

            # Don't remove them; keep empty placeholders instead
            else:
                # Entries with missing content, title, or URL become empty placeholders
                if not v or not v['content'] or not v['title'] or not v['url']:
                    targets.append("")
                    # Unique placeholder label without mutating the outer loop index
                    labels.append("Empty" + str(len(labels)))
                else:
                    # Preprocessing
                    preprocessed_content = preprocessing(v['content'])
                    string_preprocessed_content = ' '.join(
                        preprocessed_content)
                    targets.append(string_preprocessed_content)
                    labels.append(v['url'])

        dataset.append(
            simhash_1(labels, targets, query, query_url, checking, k, width))
        # print('-'*50)

    appendToDataset("./dataset/simhash_dataset.csv", dataset)
Example #2
def main():
    # Get data
    data = readJsonData('./dataset/gossipcop_results.json')
    dataset = []

    # Split for Cross Validation
    # x_train, x_test = train_test_split(
    #     data, test_size=0.2, random_state=1234)  # test = 20%, train = 80%

    # Set settings
    min_jaccard_value = None
    n_gram = 3
    n_gram_type = 'term'
    n_permutations = 100
    no_of_bands = 50
    checking = 'gossipcop'

    for i in data.index:
        # Set query and targets
        preprocessed_query = preprocessing(data['original_article.content'][i])
        string_preprocessed_content = ' '.join(preprocessed_query)
        # Content must be at least n_gram characters/terms long
        if n_gram_type == 'char' and len(string_preprocessed_content) < n_gram:
            continue
        if n_gram_type == 'term' and len(preprocessed_query) < n_gram:
            continue

        query = string_preprocessed_content
        targets = [query]
        labels = [data['original_article.url'][i]]
        for v in data['extracted_articles'][i]:
            # Skip entries with missing content, title, or URL
            if not v or not v['content'] or not v['title'] or not v['url']:
                continue

            # Preprocessing
            preprocessed_content = preprocessing(v['content'])
            string_preprocessed_content = ' '.join(preprocessed_content)

            # Content must be at least n_gram characters/terms long
            if n_gram_type == 'char' and len(string_preprocessed_content) < n_gram:
                continue
            if n_gram_type == 'term' and len(preprocessed_content) < n_gram:
                continue

            targets.append(string_preprocessed_content)
            labels.append(v['url'])

        # find near-duplicate targets for `query`
        dataset.append(find_near_duplicate(checking, query, targets, labels,
                            min_jaccard_value, no_of_bands, n_permutations, n_gram, n_gram_type))
        # print('-'*50)

    appendToDataset("./dataset/lsh_dataset.csv", dataset)
def test():
    
    text = """
    LTE single-card dual-standby multi-mode terminal and method for processing concurrency of its CS service and PS service 

    The present invention is applicable to the field of communications technologies, and provides an method, the method includes: when a CS service and PS service of a local LTE single-card dual-standby multi-mode terminal are concurrent, detecting, by a local LTE single-card dual-standby multi-mode terminal, whether a peer communication terminal that is performing voice communication with it is in a voice silent period; when detecting that the peer communication terminal is not in the voice silent period, receiving, by the local LTE single-card dual-standby multi-mode terminal, downlink data in an LTE system, and suspending, by the local LTE single-card dual-standby multi-mode terminal, sending of uplink data in the LTE system at the same time; and when detecting that the peer communication terminal is in the voice silent period, sending the uplink data and receiving the downlink data, by the local LTE single-card dual-standby multi-mode terminal, in the LTE system.
    
    """
     
    bigram_measures = BigramAssocMeasures()
    #trigram_measures = TrigramAssocMeasures()
      
    # change this to read in your data
      
    finder = BigramCollocationFinder.from_words(preprocessing(text))
     
     
      
    # optionally keep only bigrams that appear at least twice
    #finder.apply_freq_filter(2)

    # print the top 50 bigrams by likelihood ratio (other measures are commented out)
    #print(finder.nbest(bigram_measures.pmi,50))
    #print(finder.nbest(bigram_measures.likelihood_ratio, 20))
    #print(finder.nbest(bigram_measures.poisson_stirling, 20))
    for x,y in finder.nbest(bigram_measures.likelihood_ratio,50):
        print(x+' '+y)
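
# preprocessing() is called on raw text throughout these snippets but never
# shown. A plausible NLTK-based stand-in (lowercase, tokenise, drop stopwords
# and punctuation-only tokens) is sketched below; this is an assumption, not
# the author's code, and does not cover the file-based preprocessing(f) /
# preprocessing(f, ds) variants used in the decision-tree snippets.
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

_STOPWORDS = set(stopwords.words('english'))

def preprocessing_sketch(text):
    # Hypothetical text preprocessing: lowercase, tokenize, filter tokens.
    tokens = word_tokenize(text.lower())
    return [t for t in tokens
            if t not in _STOPWORDS and any(c.isalnum() for c in t)]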
Example #4
def createRandomForest(self, f, ds, train):
    bootstraps = bootstrap(matrix=train, n=self.ntree)
    trees = []
    for boots in bootstraps:
        # Each bootstrap sample becomes the data matrix for one tree
        ds.dataMatrix = boots[0]
        x, y, attrList, possibleValuesList = preprocessing(f, ds)
        tree = self.generateTree(x, y, attrList, possibleValuesList)
        trees.append(tree)
    return trees
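
# bootstrap() is referenced above but not included; the caller only uses
# boots[0] as the in-bag training matrix, so it presumably returns one sample
# per tree. A minimal sketch under that assumption (hypothetical; the shape of
# the returned tuples is a guess):
import random

def bootstrap_sketch(matrix, n):
    samples = []
    for _ in range(n):
        # Draw len(matrix) rows with replacement; keep the untouched rows as a
        # hypothetical out-of-bag set in slot [1].
        in_bag = [random.choice(matrix) for _ in range(len(matrix))]
        out_of_bag = [row for row in matrix if row not in in_bag]
        samples.append((in_bag, out_of_bag))
    return samples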
def TFIDF_1(docs, manuals, topN):

    _DOCS_NUM = len(docs)
    docs_words = []
    idfCount = {}

    print('PROCESS--IDF')
    for i, doc in enumerate(docs):
        sys.stdout.write("\r{0}/{1}".format(i + 1, _DOCS_NUM))
        sys.stdout.flush()
        words = preprocessing(doc.lower())

        docs_words.append(words)

        for word in set(words):
            if word in idfCount:
                idfCount[word] += 1
            else:
                idfCount[word] = 1

    tp = 0
    predicted_num = 0
    candidates = {}

    print('\nPROCESS--TFIDF and topN predict')
    for i, doc_words in enumerate(docs_words):
        sys.stdout.write("\r{0}/{1}".format(i + 1, len(docs_words)))
        sys.stdout.flush()
        tfCount = {}
        tfidf = {}
        _WORDS_NUM = 0
        for word in doc_words:
            if word in tfCount:
                tfCount[word] += 1
            else:
                tfCount[word] = 1
            _WORDS_NUM += 1

        for word, freq in tfCount.items():
            idf = math.log10(_DOCS_NUM / idfCount[word])
            tfidf[word] = freq / _WORDS_NUM * idf

        for word, value in tfidf.items():
            if word in candidates:
                candidates[word] += value
            else:
                candidates[word] = value

        predicted = dictTopN(candidates, topN)
        predicted_num += len(predicted)
        tp += sum(1 for word in predicted if word in manuals)

    statics(tp, predicted_num, len(manuals))
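
# dictTopN() is not part of these snippets; from its use above it presumably
# returns the topN keys with the highest accumulated scores. A minimal sketch
# under that assumption (hypothetical, not the source helper):
def dictTopN_sketch(score_dict, topN):
    ranked = sorted(score_dict.items(), key=lambda kv: kv[1], reverse=True)
    return [word for word, _ in ranked[:topN]]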
Example #6
def test(file, examples):
    print("Testing bootstrap for:", file)
    f = files[file]
    x, y, attrList, possibleValuesList = preprocessing(f)
    print(possibleValuesList)

    dt = DecisionTree(x, y, attrList, possibleValuesList, int(len(
        x[0])**0.5))  # **1 for test dataset, **0.5 for the other ones
    dt.training()

    print("\nDecision Tree:\n")
    dt.printTree()

    for example in examples:
        print("\n", dt.predict(example))

    print("\n---------------------------------------------------\n")
def TFIDF(docs):

    _DOCS_NUM = len(docs)
    docs_bis = []
    #total_bis = []
    result = {}
    #bigram_measures = BigramAssocMeasures()
    bisFreDist = {}
    _BI_NUM = 0

    for doc in docs:
        #finder = BigramCollocationFinder.from_words(preprocessing(doc))
        #bi = finder.nbest(bigram_measures.likelihood_ratio,2000)
        bi = list(nltk.bigrams(preprocessing(doc)))
        docs_bis.append(bi)
        _BI_NUM += len(bi)
        #total_bis+=bi
        for word in bi:
            if word in bisFreDist:
                bisFreDist[word] += 1
            else:
                bisFreDist[word] = 1

    #bisFreDist = nltk.FreqDist(total_bis)
    #total_bis = None

    for word, freq in bisFreDist.items():
        try:
            count = sum(1 for doc_bis in docs_bis if word in doc_bis)
            idf = math.log10(_DOCS_NUM / count) + 0.01
            #print(idf,bi_TFdist.freq(word))
            (x, y) = word
            #result[x.lower()+' '+y.lower()] = bisFreDist.freq(word)*idf
            result[x.lower() + ' ' + y.lower()] = freq / _BI_NUM * idf
        except AttributeError:
            pass

    bisFreDist = None

    return result
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# These two lines are necessary to find source files!!!
import sys
sys.path.append('../src')

from files import files
from main import DataSet, preprocessing, DecisionTree

if __name__ == '__main__':
    f = files["test"]

    x, y, attrList, possibleValuesList = preprocessing(f)

    dt = DecisionTree(x, y, attrList, possibleValuesList, int(len(
        x[0])**1))  # **1 for test dataset, **0.5 for the other ones
    dt.training()

    print "\nDecision Tree:\n"
    dt.printTree()
Example #9
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from main import preprocessing


text = """

A computer system provides a plug-in architecture for creation of a dynamic font. The computer system can incorporate a new filter function into a filtering layer of a font program. The filtering layer includes pre-defined filter functions to transform a base font into a new font. The computer system applies one or more font rules in the filtering layer to the base font. The font rules are implemented by the new filter function and at least one of the pre-defined filter functions to randomize an appearance of each character in a character string. The character string rendered with the new font has a dynamic and randomized appearance.

"""

bigram_measures = BigramAssocMeasures()
#trigram_measures = TrigramAssocMeasures()
 
# change this to read in your data
 
finder = BigramCollocationFinder.from_words(preprocessing(text))


 
# optionally keep only bigrams that appear at least twice
#finder.apply_freq_filter(2)

# print bigrams ranked by each association measure
print(finder.nbest(bigram_measures.pmi,-1))
print(finder.nbest(bigram_measures.likelihood_ratio, -1))
print(finder.nbest(bigram_measures.poisson_stirling, -1))


""" 
d = ['09-2012', '04-2007', '11-2012', '05-2013', '12-2006', '05-2006', '08-2007']
sort_index = sorted(d, key=lambda x: datetime.datetime.strptime(x, '%m-%Y'))
def proposed(docs, manuals, topN, alpa):

    print('=== PROPOSED ====')

    _DOCS_NUM = len(docs)
    docs_words = []
    _WORDS_NUM = 0
    idfCount = {}
    tfCount = {}
    occurCount = {}
    """  Log-likelihood ratio bigrams """
    print('PROCESS--Log likelihood ratio bigrams')
    tmp = []
    for doc in docs:
        tmp += preprocessing(doc.lower())
    finder = BigramCollocationFinder.from_words(tmp)
    #_size = math.floor(len(finder.score_ngrams(BigramAssocMeasures().likelihood_ratio))/20)+1
    bigramSet = finder.nbest(BigramAssocMeasures().likelihood_ratio, 200)

    #     finder = TrigramCollocationFinder.from_words(tmp)
    #     _size = math.floor(len(finder.score_ngrams(TrigramAssocMeasures().likelihood_ratio))/40)+1
    #     trigramSet = finder.nbest(TrigramAssocMeasures().likelihood_ratio, _size)
    #     tmp = None

    #nltk.RegexpParser('{(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}')

    for i, doc in enumerate(docs):
        sys.stdout.write("\r{0}/{1}".format(i + 1, len(docs)))
        sys.stdout.flush()
        words = preprocessing(doc.lower())

        dealwith = set()
        for x, y in bigramSet:
            if x in words and y in words:
                if x not in dealwith:
                    words.remove(x)
                    dealwith.add(x)
                if y not in dealwith:
                    words.remove(y)
                    dealwith.add(y)
                words.append(x + ' ' + y)
            else:
                pass

#         for x,y,z in trigramSet:
#             if x in words and y in words and z in words:
#                 if x not in dealwith:
#                     words.remove(x)
#                     dealwith.add(x)
#                 if y not in dealwith:
#                     words.remove(y)
#                     dealwith.add(y)
#                 if z not in dealwith:
#                     words.remove(z)
#                     dealwith.add(z)
#                 words.append(x+' '+y+' '+z)
#             else:
#                 pass

        docs_words.append(words)

        _WORDS_NUM += len(words)
        """  count idf """
        for word in set(words):
            if word in idfCount:
                idfCount[word] += 1
            else:
                idfCount[word] = 1
        """  count occur and tf"""
        for j, word1 in enumerate(words):

            if word1 in tfCount:
                tfCount[word1] += 1
            else:
                tfCount[word1] = 1

            for word2 in words[j + 1:]:
                if (word1, word2) in occurCount:
                    occurCount[(word1, word2)] += 1
                elif (word2, word1) in occurCount:
                    occurCount[(word2, word1)] += 1
                else:
                    occurCount[(word1, word2)] = 1


#     """ Compute PMI"""
#     for (word1, word2) in occurCount:
#         val = round(math.log10(occurCount[(word1, word2)]*_WORDS_NUM/tfCount[word1]/tfCount[word2]), 8)
#         occurCount[(word1, word2)] = val
#
#     """  Construct graph """
#     g = nx.Graph()
#     for (word1, word2),value in occurCount.items():
#         g.add_edge(word1, word2, weight=value)
#     print('Graph node number: %s'%(g.number_of_nodes()))
#     print('Graph edge number: %s'%(g.number_of_edges()))
#     occurCount = None
#
#     rwrScore = RWR(g, None, 0.03 , 1000, 0.000003)

#     _min = min(rwrScore.values())
#     _max = max(rwrScore.values())
#     print(_min,_max)
#     for key,value in rwrScore.items():
#         rwrScore[key] = (value-_min)/(_max-_min)

    tp = 0
    predicted_num = 0
    candidates = {}

    for i, doc_words in enumerate(docs_words):
        sys.stdout.write("\r{0}/{1}".format(i + 1, len(docs_words)))
        sys.stdout.flush()
        thisTfCount = {}
        newScore = {}

        _NUM = len(doc_words)

        for word in doc_words:
            if word in thisTfCount:
                thisTfCount[word] += 1
            else:
                thisTfCount[word] = 1
        """ TF-IDF """
        for word, tf in thisTfCount.items():
            newScore[word] = tf / _NUM * math.log10(_DOCS_NUM / idfCount[word])

        _min = min(newScore.values())
        _max = max(newScore.values())

        for word, tfidf in newScore.items():
            #val = alpa*(tfidf-_min)/(_max-_min) + (1-alpa)*rwrScore[word]
            #val = tfidf + rwrScore[word]
            if word in candidates:
                candidates[word] += tfidf
            else:
                candidates[word] = tfidf

        predicted = dictTopN(candidates, topN)
        predicted_num += len(predicted)
        tp += sum(1 for word in predicted if word in manuals)

    statics(tp, predicted_num, len(manuals))
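
# statics(tp, predicted_num, manual_num) presumably reports precision/recall
# style figures from the counts passed to it. A minimal sketch under that
# assumption (hypothetical, not the source helper):
def statics_sketch(tp, predicted_num, manual_num):
    precision = tp / predicted_num if predicted_num else 0.0
    recall = tp / manual_num if manual_num else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    print('\nprecision: %.4f  recall: %.4f  F1: %.4f' % (precision, recall, f1))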
Example #11
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
# api.load() below is the gensim downloader interface; the import was
# presumably in a cell not shown here
import gensim.downloader as api
import main

# In[2]:

# In[3]:

train_data = main.create_dataframe('train.raw')
test_data = main.create_dataframe('test.raw')

# In[4]:

train_processed = train_data.apply(
    lambda x: main.preprocessing(x, train_data)
    if x.name in ['sentence', 'words_left', 'words_right'] else x)
test_processed = test_data.apply(
    lambda x: main.preprocessing(x, test_data)
    if x.name in ['sentence', 'words_left', 'words_right'] else x)

# In[5]:

# In[28]:

# In[30]:

word2vec = api.load("glove-twitter-100")

# In[6]:
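
# A possible next step, not shown in the source: look up glove-twitter-100
# vectors for the preprocessed tokens. This assumes the processed 'sentence'
# column holds token lists, which the snippet above does not confirm.
sentence_vectors = [
    [word2vec[w] for w in tokens if w in word2vec]
    for tokens in train_processed['sentence']
]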