def train(num_lsa_topics, k):
    
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    #TOKENIZE
    xs = SentenceFragmentData.SentenceFragmentData()
    
    tokenizer = WordTokenizer.WordTokenizer(min_word_count=5)
    tokenized_docs = tokenizer.tokenize(xs.documents)

    #MAP TO VECTOR AND SEMANTIC SPACE
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)
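    # Convert the gensim LSA output into a plain Python matrix (one row per document,
    # num_lsa_topics columns) so it can be filtered and clustered below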
    full_lsa_matrix = MatrixHelper.gensim_to_python_mdarray(lsa.distance_matrix, num_lsa_topics)
    
    #Filter to just the SM codes
    sm_code_lsa_matrix = ListHelper.filter_list_by_index(full_lsa_matrix, xs.sm_code_indices)
    
    #CLUSTER
    clusterer = Clusterer.Clusterer(k)
    labels = clusterer.Run(sm_code_lsa_matrix)

    #OUTPUT - Filter by SM Code only this time
    file_name_code_clusters = "LSA_SMCODES_Fragments_k-means_k_{0}_dims_{1}.csv".format(k, num_lsa_topics)
    sm_codes_per_doc   = ListHelper.filter_list_by_index(xs.codes_per_document, xs.sm_code_indices)
    ClustersToFile.clusters_to_file(file_name_code_clusters, labels, sm_codes_per_doc, "Chicago")
    
    file_name_category_clusters = "LSA_Categories_Fragments_k-means_k_{0}_dims_{1}.csv".format(k, num_lsa_topics)
    categories_per_doc = ListHelper.filter_list_by_index(xs.categories_per_document, xs.sm_code_indices)
    ClustersToFile.clusters_to_file(file_name_category_clusters, labels, categories_per_doc, "Chicago")
    
    print "Finished processing lsa clustering for dims: {0} and k: {1}".format(num_lsa_topics, k)
Code example #2
def train(num_lda_topics):

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    #TOKENIZE
    xs = SentenceData.SentenceData()

    tokenizer = WordTokenizer.WordTokenizer(min_word_count=5)
    tokenized_docs = tokenizer.tokenize(xs.documents)

    #MAP TO VECTOR AND SEMANTIC SPACE
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lda = Lda.Lda(tfidf, num_topics=num_lda_topics)

    # Pull out topic labels
    topic_labels = extract_topic_labels(lda.distance_matrix)

    #OUTPUT

    file_name_code_clusters = "LDA_SMCODES_topics_{0}.csv".format(
        num_lda_topics)
    ClustersToFile.clusters_to_file(file_name_code_clusters, topic_labels,
                                    xs.codes_per_document, "Chicago")

    file_name_category_clusters = "LDA_categories_topics_{0}.csv".format(
        num_lda_topics)
    ClustersToFile.clusters_to_file(file_name_category_clusters, topic_labels,
                                    xs.categories_per_document, "Chicago")

    print "Finished processing lda clustering for dims: {0}".format(
        num_lda_topics)
Code example #3
File: Test.py Project: Rafinha19/FIC
def main():
    my_tfidf = TfIdf.TfIdf("tfidf_corpus.txt", DEFAULT_IDF=DEFAULT_IDF_UNITTEST)

    files = os.listdir("txt")
    n_files = len(files) - 150

    print("initializing information retrieval!\n")

    for i in range(n_files):
        print("Proccesing[" + str(int (i* 100/n_files) ) + "%]: (" + str(i) + ") " + files[i])
        file_act = open(".\\txt\\" + files[i], "r")
        string_act = ""
        for line in file_act.readlines():
            string_act += line

        my_tfidf.add_input_document(string_act)

    print("Process Finish!")
    my_tfidf.save_corpus_to_file("out_tfidf.txt", "out_stopword.txt")

    print("Starting Query Input!")

    while True:

        '''
        n = int(input("Choose nº document [Number of document]:"))
        if n >= 0 and n<n_files:
            print("File choosed: " + str(files[n]))
            file_act = open(".\\txt\\" + files[n], "r")
            string_act = ""
            for line in file_act.readlines():
                string_act += line

            q = input("Choose you query:")
            print(str(my_tfidf.get_tfipc(string_act,q)))
        '''
        q = input("Choose you query (#q to exit):")
        if q == "#q":
            break

        dic_tfipf = {}

        for i in range(n_files):
            print("Proccesing query[" + str(int(i * 100 / n_files)) + "%]: (" + str(i) + ") " + files[i])
            file_act = open(".\\txt\\" + files[i], "r")
            string_act = ""
            for line in file_act.readlines():
                string_act += line
            dic_tfipf[files[i]] = my_tfidf.get_tfipc(string_act,q)
            print(dic_tfipf[files[i]])

        sorted_dic_tfipf = OrderedDict(sorted(dic_tfipf.items(), key=itemgetter(1), reverse=True))

        print("\nSorted list of files with TF-IDF:\n")

        for file, tfipf_value in sorted_dic_tfipf.items():
            print("#" + str(tfipf_value) + " :->: " + file)
Code example #4
    def test_on_data():
        import GwData
        import WordTokenizer
        import TfIdf

        import Converter
        import MatrixHelper

        data = GwData.GwData()
        tokenized = WordTokenizer.tokenize(data.documents)
        tfidf = TfIdf.TfIdf(tokenized)
Code example #5
File: Main.py Project: jcnm/HotSpot
def set_tfidf():
    articles = mod.articles.find()
    res = []
    l = []
    for i in articles:
        l.append((i["title"],i["keywords"]))
    articles = mod.articles.find()
    for article in articles:
        #print article["title"]
        x = (article["title"],article["keywords"])
        tf_idf = {word: tf.tfidf(word, x[1], l) for word in x[1].keys()}
        mod.articles.update({"title":x[0]},{"$set" : {"tfidf":tf_idf}})        
Code example #6
    def __init__(self, num_topics, directory=None, min_sentence_length=3):
        if directory is None:
            directory = Settings.Settings().data_directory + "\\GlobalWarming"

        if not directory.endswith("\\"):
            directory += "\\"

        self.directory = directory
        logging.log(logging.INFO,
                    "GwLsaClass: Processing Data from directory \n\t'%s'",
                    directory)

        lsa_file = "{0}lsa_{1}.lsi".format(directory, num_topics)
        id2Word_file = "{0}id2Word.txt".format(directory)

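        # A cached LSA model would be loaded from disk here, but loading is currently
        # disabled (see the TODO below), so the model is always rebuilt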
        if os.path.isfile(lsa_file):
            pass
            #TODO
            #self.__lsa__ = LsiModel.load(lsa_file)
            #self.id2Word = corpora.Dictionary.load(id2Word_file)
            #return

        lines = self.__loadLines__("globalwarming_specific_space11.txt")
        lines.append("")

        sentences = []
        current = ""
        for line in lines:
            current += line
            if (len(line.strip()) == 0 and len(current.strip()) > 0):
                sent = nltk.sent_tokenize(current.strip().lower())

                sentences.extend(sent)
                current = ""

                #if len(sentences) > 100:
                #print " >> STOPPING EARLY TO SPEED DEBUGGING, PLEASE REMOVE"
                #break

        documents = []
        wt = WordTokenizer.WordTokenizer(min_word_count=3)
        tokenized = wt.tokenize(sentences)
        for tokenized_doc in tokenized:
            if len(tokenized_doc) >= min_sentence_length:
                documents.append(tokenized_doc)

        tfidf = TfIdf.TfIdf(documents)
        self.__lsa__ = Lsa.Lsa(tfidf, num_topics=num_topics)
        self.id2Word = tfidf.id2Word
        self.num_topics = num_topics
Code example #7
def search():
    phasil = {}
    a = 0
    for x in range(len(content)):
        data[x] = preprocessing.preprocess(content[x], queryinp)
    hasil = TfIdf.__init__(data, queryinp)
    for x in range(len(hasil)):
        phasil[x] = printhasil(content[x], hasil[x])
    for key, value in sorted(phasil.items(),
                             key=lambda e: e[1][2],
                             reverse=True):
        if value[2] > 0.002:
            a += 1
            print(value[0] + '\n' + value[1][:100] + '\n' + value[1][100:200] +
                  '\n')
    print('got ' + str(a) + ' documents')
Code example #8
File: Main.py Project: jcnm/HotSpot
def generate(n):
    if n > 0:
        title = wikipedia.random(pages=1)
        try:
            page = wikipedia.WikipediaPage(title)
            summary = page.summary
            content = tb(page.content)
            tff = {word: tf.tf(word,content) for word in content.words}
            res = {}
            for i in tff:
                if tff[i] == 0.0:
                    continue
                else:
                    res[i] = tff[i]
            article = art.Article(title, summary, kw=res, tfidf={})
            mod.insert(article)
            generate(n-1)
        except wikipedia.exceptions.DisambiguationError:
            generate(n)
        except bson.errors.InvalidDocument:
            generate(n)
Code example #9
def train(num_lsa_topics, k, window_size):

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    #TOKENIZE
    xs = SentenceData.SentenceData()

    tokenizer = WordTokenizer.WordTokenizer(min_word_count=5)
    tokenized_docs = tokenizer.tokenize(xs.documents)
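    # Split each tokenized document into fixed-size word windows; window_indices maps
    # each window back to its source sentence so the cluster labels can be pivoted back later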
    windowed_docs, window_indices = split_documents_into_windows(
        tokenized_docs, window_size)

    #MAP TO VECTOR AND SEMANTIC SPACE
    tfidf = TfIdf.TfIdf(windowed_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)
    full_lsa_matrix = MatrixHelper.gensim_to_python_mdarray(
        lsa.distance_matrix, num_lsa_topics)

    #CLUSTER
    clusterer = Clusterer.Clusterer(k)
    window_labels = clusterer.Run(full_lsa_matrix)

    #Extract the labels for the original sentences using the indices built earlier
    labels = pivot_window_labels(window_labels, window_indices)

    #OUTPUT
    file_name_code_clusters = "Windowed_LSA_SMCODES_win_size_{0}_k-means_k_{1}_dims_{2}.csv".format(
        window_size, k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_code_clusters, labels,
                                    xs.codes_per_document, "Chicago")

    file_name_category_clusters = "Windowed_LSA_Categories_win_size_{0}_k-means_k_{1}_dims_{2}.csv".format(
        window_size, k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_category_clusters, labels,
                                    xs.categories_per_document, "Chicago")

    logging.info(
        "Finished processing lsa clustering for dims: {0} and k: {1}".format(
            num_lsa_topics, k))
Code example #10
def main():

    #SETTINGS
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER

    #TOKENIZE
    data = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(data.documents, min_word_count=5)
    tfidf = TfIdf.TfIdf(tokenized_docs)

    #NLTK Decision Tree
    np_matrix = MatrixHelper.gensim_to_numpy_array(tfidf.matrix,
                                                   initial_value=0)

    labels = data.causal_per_document

    def get_svm_val(x):
        if x <= 0:
            return -1
        return 1

    labels = map(get_svm_val, labels)

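    # Train on the first 75% of rows and hold out the remaining 25% for validation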
    td_size = int(0.75 * len(np_matrix))

    td_x = np_matrix[:td_size]
    td_y = labels[:td_size]

    vd_x = np_matrix[td_size:]
    vd_y = labels[td_size:]

    rng = array(range(1, 21, 1))

    c_vals = rng / 10.0

    all_results = ""
    for c in c_vals:
        classifier = svm.LinearSVC(C=c)
        classifier.fit(td_x, td_y)

        #RESULTS
        classifications = classifier.predict(vd_x)

        results = "\nC VALUE: " + str(c) + "\n"
        results += ResultsHelper.rfp(vd_y, classifications)
        print results

        all_results += results
    #print "EXPLAIN:\n"
    #me.explain(condensed_data[0], 100)

    #DUMP TO FILE
    fName = results_dir + "Causal_Relation_SVM.txt"
    handle = open(fName, mode="w+")
    handle.write(all_results)
    handle.close()

    #binary_matrix = term_freq.binary_matrix()
    #decision_tree = tree.DecisionTreeClassifier(criterion = 'entropy')
    #decision_tree.fit(binary_matrix, labels)

    # Test with CL1 labels
    raw_input("Press Enter to quit")
Code example #11
        return self.distance_matrix[self.words[wd]].flatten().tolist()[0]

    def project(self, item):
        if type(item) == type(""):
            # a single word: return its vector row directly
            return self.distance_matrix[self.words[item]].flatten().tolist()[0]

        l = []
        for w in item:
            if w in self.words:
                l.append(self.project(w))
        return l


if __name__ == "__main__":
    import GwData
    import TfIdf
    import WordTokenizer

    e = Embeddings()

    d = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(d.documents,
                                            min_word_count=1,
                                            stem=False,
                                            remove_stop_words=False)
    tf = TfIdf.TfIdf(tokenized_docs)

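    # Compare the embedding vocabulary with the tokenized corpus vocabulary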
    ewds = set(e.words)

    dwds = set([w for w in tf.id2Word.values()])
    pass
Code example #12
    def tfidf_vspace(self, tokenized_docs):
        tfidf = TfIdf.TfIdf(tokenized_docs)
        return (tfidf.distance_matrix, tfidf.id2Word)
Code example #13
#get the count of the number of documents
#we know that this number gets updated only once every 6 hours. This would need to be
#modified if we are going to work with a system in which articles are being added to
#the database constantly
lengthOfCorpus = tableWithDocs.count()

#tableWithDocs.query_count(last_name__eq='Doe')

#rows = tableWithDocs.scan(body__contains= 'obama')
#index = 0
#for row in rows:
#	index +=1
#print index
#exit()

tdIdfCalculator = TfIdf.TfIdf(lengthOfCorpus, numDocsWithKeyword, keyword)

columnWithBody = sys.argv[4]

columnWithUniqueId = sys.argv[5]

rows = tableWithDocs.scan()

#this is the table used to store the Tf-Idf values
tdidfIndexTable = Table('TfIdfNew', connection=db)

#tdidfTl = tdidfIndexTable.query_2(word__eq = 'obama')
#for row in tdidfTl:
#	print row['articleId']
#	print row['tdIdfRoundTo7']
Code example #14
def train(num_lsa_topics, k):

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)

    #TOKENIZE
    xs = SentenceData.SentenceData()

    tokenizer = WordTokenizer.WordTokenizer(min_word_count=5)
    tokenized_docs = tokenizer.tokenize(xs.documents)

    #MAP TO VECTOR AND SEMANTIC SPACE
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)
    full_lsa_matrix = MatrixHelper.gensim_to_python_mdarray(
        lsa.distance_matrix, num_lsa_topics)

    #TODO Partition into Docs by LSA sim
    txt_codes = xs.text_codes
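    # Spread the k clusters evenly across the source documents (text codes)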
    clusters_per_text_code = int(round(k / float((len(txt_codes)))))

    #Extract the sm code rows from LSA
    smCodeRows = ListHelper.filter_list_by_index(full_lsa_matrix,
                                                 xs.sm_code_indices)
    smCodeClassifications = ListHelper.filter_list_by_index(
        xs.codes_per_document, xs.sm_code_indices)
    smCodeCategoryClassifications = ListHelper.filter_list_by_index(
        xs.categories_per_document, xs.sm_code_indices)

    # Dict of <code, list[list]> - LSA row vectors
    logging.info("Partitioning LSA distance_matrix by Source Document")

    txtMatrixByCode = PartitionByCode.partition(full_lsa_matrix, xs,
                                                xs.text_codes)
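    # Assign each SM-code row to the source document whose LSA partition it is closest to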
    closest_docs = [
        find_closest_document(txtMatrixByCode, row) for row in smCodeRows
    ]
    matrix_by_doc = collections.defaultdict(list)

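    # Group the SM-code LSA rows by their closest source document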
    for i, doc in enumerate(closest_docs):
        matrix_by_doc[doc].append(smCodeRows[i])

    #Stores all cluster labels
    logging.info("Clustering within a document")
    all_smcode_labels = []
    label_offset = 0
    for doc in xs.text_codes:
        distance_matrix = matrix_by_doc[doc]
        #CLUSTER
        clusterer = Clusterer.Clusterer(clusters_per_text_code)
        labels = clusterer.Run(distance_matrix)
        all_smcode_labels = all_smcode_labels + [
            int(l + label_offset) for l in labels
        ]
        label_offset += clusters_per_text_code

    #OUTPUT
    file_name_code_clusters = "Partition_By_Doc_LSA_SMCODES_k-means_k_{0}_dims_{1}.csv".format(
        k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_code_clusters, all_smcode_labels,
                                    smCodeClassifications, "Chicago")

    file_name_category_clusters = "Partition_By_Doc_LSA_categories_k-means_k_{0}_dims_{1}.csv".format(
        k, num_lsa_topics)
    ClustersToFile.clusters_to_file(file_name_category_clusters,
                                    all_smcode_labels,
                                    smCodeCategoryClassifications, "Chicago")

    #TODO - filter the category and the docs per docs to the sm codes and output
    #file_name_category_clusters = "Partition_By_Doc_LSA_categories_k-means_k_{0}_dims_{1}.txt".format(k, num_lsa_topics)
    #ClustersToFile.clusters_to_file(file_name_category_clusters, all_smcode_labels, smCodeClassifications, "Chicago")

    print "Finished processing lsa clustering for dims: {0} and k: {1}".format(
        num_lsa_topics, k)
Code example #15
    def lsa_vspace(self, tokenized_docs):
        tfidf = TfIdf.TfIdf(tokenized_docs)
        lsa = Lsa.Lsa(tfidf, self.num_topics)
        return (lsa.distance_matrix, lsa.id2Word)
Code example #16
def train():

    #SETTINGS
    cv_folds = 10
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    settings = Settings.Settings()
    results_dir = settings.results_directory + GwData.FOLDER
    num_lsa_topics = 100

    #TOKENIZE
    xs = GwData.GwData()
    tokenized_docs = WordTokenizer.tokenize(xs.documents, min_word_count=5)
    tfidf = TfIdf.TfIdf(tokenized_docs)
    lsa = Lsa.Lsa(tfidf, num_topics=num_lsa_topics)

    #NLTK SVM linear kernel
    xs = MatrixHelper.gensim_to_numpy_array(lsa.distance_matrix,
                                            initial_value=0)

    total_recall, total_precision, total_f1 = 0.0, 0.0, 0.0

    all_results = "LSA Dimensions: " + str(num_lsa_topics)
    print all_results

    processed_code_count = 0
    #MIN_CODE_COUNT = 5

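    # Codes with at most MIN_CODE_COUNT labelled fragments are skipped below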
    MIN_CODE_COUNT = 1

    codes = [
        c for c in xs.sm_codes
        # Exclude pure vague codes
        if c != "v" and
        # Exclude doc codes. Need whole doc to classify them
        not c.startswith("s")
    ]

    for code in codes:

        code_count = xs.sm_code_count[code]
        if code_count <= MIN_CODE_COUNT:
            continue

        processed_code_count += 1
        labels = map(Converter.get_svm_val, xs.labels_for(code))
        classifier = svm.LinearSVC(C=1)
        recall, precision, f1_score = cross_validation_score(xs,
                                                             labels,
                                                             classifier,
                                                             cv_folds,
                                                             class_value=1.0)
        results = "Code: {0} Count: {1}, Recall: {2}, Precision: {3}, F1: {4}\n".format(
            code.ljust(10), code_count, recall, precision, f1_score)

        all_results += results
        total_recall += recall
        total_precision += precision
        total_f1 += f1_score

        print results,

    #num_codes = len(xs.sm_codes)
    num_codes = processed_code_count
    result = "AGGREGATE\n\t Recall: {0}, Precision: {1}, F1: {2}\n".format(
        total_recall / num_codes, total_precision / num_codes,
        total_f1 / num_codes)
    all_results += result
    print result

    #DUMP TO FILE
    fName = results_dir + "Codes_ClassifyUsing_SVM_with_EssayBasedLSA_Dims_" + str(
        num_lsa_topics) + ".txt"
    handle = open(fName, mode="w+")
    handle.write(all_results)
    handle.close()
Code example #17
    def __init__(self,
                 tokenized_docs,
                 latentSpaceFactory,
                 aggregation_method="doc",
                 normalize=False,
                 unit_vectors=False,
                 term_frequency_only=False):
        """
        Projects words to a vector space
        """
        tokenized_docs = [t for t in tokenized_docs if len(t) > 0]

        def pivot_by_words(dct, doc):
            for word1 in doc:
                for word2 in doc:
                    if word1 != word2:
                        dct[word1].append(word2)

        """ Pivot Docs Around Words """
        d = defaultdict(list)
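        # d maps each word to the ids of the contexts it occurs in: document ids,
        # co-occurring words, or window ids, depending on the aggregation method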
        if aggregation_method == "doc":
            """ term - doc space """
            for i, doc in enumerate(tokenized_docs):
                for word in doc:
                    d[word].append(str(i))
        elif aggregation_method == "sentence":
            """ word space - words to words  """
            for i, doc in enumerate(tokenized_docs):
                pivot_by_words(d, doc)
        elif aggregation_method.startswith("window:"):
            _, str_size = aggregation_method.split(":")

            win_size = int(str_size)
            print "Window Size:", win_size

            win_id = 0
            for doc in tokenized_docs:
                windows = split_into_windows(doc, win_size)
                for win in windows:
                    for word in win:
                        d[word].append(str(win_id))
                    win_id += 1
                    #pivot_by_words(d, win)
            print "Size of windowed method:", len(d)
            pass
        else:
            raise Exception(
                "Unexpected aggregation_method value: %s. Accepted values are 'doc', 'sentence', 'window:n'"
                % aggregation_method)

        tokenized_docs = d.values()
        self.word_to_index = dict()
        for i, wd in enumerate(d.keys()):
            self.word_to_index[wd] = i

        if term_frequency_only:
            tf = TermFrequency.TermFrequency(tokenized_docs)
            latent_space = latentSpaceFactory(tf, tokenized_docs)
        else:
            tfidf = TfIdf.TfIdf(tokenized_docs)
            latent_space = latentSpaceFactory(tfidf, tokenized_docs)
        """ Construct Vector Space """
        self.latent_vector = []
        for i, v in enumerate(latent_space):
            vec = [val for idx, val in v]
            """ Example Normalization """
            if unit_vectors:
                vec = unit_vector(vec)
            self.latent_vector.append(vec)
        """ Normalize """
        if normalize:
            tmp_arr = np.array(self.latent_vector)
            means = np.mean(tmp_arr, axis=0)
            sds = np.std(tmp_arr, axis=0)
            norm = (tmp_arr - means) / sds
            self.latent_vector = norm
        pass