Python text_processing Examples, text_processing.text_processing Python Examples

Example #1

0

Show file

def semEval():
    """Run the quantification experiment on the five-point subtask-CE data.

    Reads the train and devtest gold files, builds a feature matrix by
    horizontally stacking the output of ``text_processing(n=4)`` and
    ``char_processing()``, fits a ``Quantification`` model with method
    ``'Iter1'``, writes the per-topic result of ``predict_set`` through
    ``write_semeval`` and prints the score of every quantification method.
    """
    # Other dataset stems that were tried here:
    #   '100_topics_100_tweets.sentence-three-point.subtask-A'
    #   '100_topics_XXX_tweets.topic-two-point.subtask-BD'
    fname = '100_topics_100_tweets.topic-five-point.subtask-CE'

    # --- training data: both vectorizers are fitted on the training texts ---
    train = read_semeval(
        'texts/2download/gold/train/' + fname + '.train.gold.tsv')
    word_vectorizer = text_processing(n=4)
    char_vectorizer = char_processing()
    train_word_feats = word_vectorizer.fit_transform(train[0])
    train_char_feats = char_vectorizer.fit_transform(train[0])
    X_train = scipy.sparse.hstack(
        (train_word_feats, train_char_feats), format='csr')
    y_train = np.asarray(train[1])

    # --- test data: reuse the fitted vectorizers (transform only) ---
    test = read_semeval(
        'texts/2download/gold/devtest/' + fname + '.devtest.gold.tsv')
    test_word_feats = word_vectorizer.transform(test[0])
    test_char_feats = char_vectorizer.transform(test[0])
    X_test = scipy.sparse.hstack(
        (test_word_feats, test_char_feats), format='csr')
    y_test = np.asarray(test[1])

    print(X_test.shape)
    # test[2] is handed to split_by_topic as the grouping column.
    X_test_list, y_test_list, utopics = split_by_topic(X_test, y_test, test[2])

    q = Quantification(method='Iter1', is_clean=True)
    q.fit(X_train, y_train)
    print('train', q._classify_and_count(y_train))

    # One predict_set result per topic, keyed by topic for the output file.
    prevs = q.predict_set(X_test_list, method='Iter1')
    write_semeval(dict(zip(utopics, prevs)), fname=fname)

    # Report every method in the same order as before.
    for method in ('CC', 'PCC', 'EM', 'EM1', 'Iter', 'Iter1', 'ACC', 'PACC'):
        print(method, q.score(X_test_list, y_test_list, method=method))

Example #2

0

Show file

File: quantification_test.py Project: Arctickirillas/Rubrication

def after_semEval():
    """Run the quantification experiment on the two-point subtask-BD data.

    Reads the ``all_train`` gold file and the ``test_gold_2.csv`` test file,
    uses only the ``text_processing(n=1)`` features, fits a
    ``Quantification`` model with method ``'Iter1'`` and prints the score of
    every quantification method on the per-topic test sets.
    """
    # Other dataset stems that were tried here:
    #   '100_topics_100_tweets.sentence-three-point.subtask-A'
    #   '100_topics_100_tweets.topic-five-point.subtask-CE'
    fname = '100_topics_XXX_tweets.topic-two-point.subtask-BD'

    # --- training data: the vectorizer is fitted on the training texts ---
    train = read_semeval(
        'texts/2download/gold/all_train/' + fname + '.all.gold.tsv')
    word_vectorizer = text_processing(n=1)
    X_train = word_vectorizer.fit_transform(train[0])
    y_train = np.asarray(train[1])

    # --- test data: reuse the fitted vectorizer (transform only) ---
    test = read_semeval('texts/2download/gold/test/test_gold_2.csv')
    X_test = word_vectorizer.transform(test[0])
    y_test = np.asarray(test[1])

    print(X_test.shape)
    # test[2] is handed to split_by_topic as the grouping column.
    X_test_list, y_test_list, utopics = split_by_topic(X_test, y_test, test[2])

    q = Quantification(method='Iter1', is_clean=True)
    q.fit(X_train, y_train)
    print('train', q._classify_and_count(y_train))

    # Report every method in the same order as before.
    for method in ('CC', 'PCC', 'EM', 'EM1', 'Iter', 'Iter1', 'ACC', 'PACC'):
        print(method, q.score(X_test_list, y_test_list, method=method))

Example #3

0

Show file

    def __init__(self, token):
        """Configure logging, the Telegram dispatcher, helpers and shared state.

        Args:
            token: Token passed to both ``Bot`` and ``Updater``.
        """
        logging.basicConfig(
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            level=logging.INFO)
        self.logger = logging.getLogger("log")
        self.bot = Bot(token)
        self.updater = Updater(token)
        self.dispatcher = self.updater.dispatcher

        # Slash commands, registered in the original order.
        start_handler = CommandHandler('start', self.start)
        self.dispatcher.add_handler(start_handler)
        info_handler = CommandHandler('info', self.info)
        self.dispatcher.add_handler(info_handler)
        feedback_handler = CommandHandler('feedback', self.feedback)
        self.dispatcher.add_handler(feedback_handler)
        insertion_handler = CommandHandler('insert', self.insert)
        self.dispatcher.add_handler(insertion_handler)

        # Inline-button presses, then plain text and document messages.
        self.dispatcher.add_handler(CallbackQueryHandler(self.button_clicked))
        text_message_handler = MessageHandler(Filters.text, self.text_message)
        self.dispatcher.add_handler(text_message_handler)
        document_message_handler = MessageHandler(Filters.document,
                                                  self.document_message)
        # Registered exactly once; the original added this same handler twice.
        self.dispatcher.add_handler(document_message_handler)
        self.dispatcher.add_error_handler(self.error)

        self.ner = Ner_Babel()
        self.prediction = Prediction()
        self.kb_interface = KB_interface()
        self.text_processing = text_processing()
        # This dictionary holds nested dictionaries named after the methods
        # that require the user to press a button; each nested dict carries the
        # variables needed to handle that action.
        # The button's callback_query then identifies that state and the
        # button that was pressed.
        self.global_variables = {
            "last_question": "",
            "last_answer": "",
            "main_entity": "",
            "related_entity": "",
            "ambiguos_entities": [],
            "sugests_topics": {
                "buttons": []
            },
            "ask_for_answer_evaluation": {
                "buttons": ["P", "N"]
            },
            "sugests_question": {
                "buttons": []
            }
        }

Example #4

0

Show file

    def endElement(self, tag):
        """Handle a closing XML tag.

        Nothing happens while no document is running. Closing ``revision`` or
        ``contributor`` clears the matching flag. Closing ``page`` finalises
        the current document: category, infobox and ref strings are extracted
        from the text with the module-level regexes, the id-to-title mapping
        is recorded, every textual field is passed through
        ``text_processing``, and the document is appended to ``doc_list``.
        Once ``doc_list`` holds ``doc_chunk_size`` documents an inverted
        index file is created and the list is emptied.
        """
        global i_count
        if self.doc_running == False:
            return

        if tag == "revision":
            self.revision = False
            return
        if tag == "contributor":
            self.contributor = False
            return
        if tag != "page":
            return

        self.doc_running = False
        page = self.doc

        # The extractions read the text before remove_extra() rewrites it.
        category_hits = re.findall(catRegExp, page.text, flags=re.MULTILINE)
        page.category = ' '.join(category_hits)
        infobox_hits = re.findall(infoRegExp, page.text, re.DOTALL)
        page.infobox = ' '.join(infobox_hits)
        ref_hits = re.findall(refRegExp, page.text, flags=re.DOTALL)
        page.ref = ' '.join(ref_hits)
        page.text = remove_extra(page.text)

        # The title is stored before text_processing() replaces it below.
        docid_title_map[int(page.id)] = page.title

        for field in ("text", "title", "comment", "category", "infobox",
                      "ref"):
            setattr(page, field, text_processing(getattr(page, field)))

        doc_list.append(page)
        if len(doc_list) == doc_chunk_size:
            # A full chunk: write it out as the next numbered index file.
            i_count += 1
            fname = create_inverted_index(doc_list, i_count,
                                          index_folder_path)
            filenames.append(fname)
            doc_list.clear()

        self.doc = ""

Example #5

0

Show file

File: preprocessing.py Project: shicongisme/kaggle-1

from keras.preprocessing import text, sequence
import pickle
from text_processing import text_processing
import os

# Load the raw train/test splits.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

max_features = 150000
maxlen = 600
embed_size = 300

# Series.fillna returns a new Series; the result must be assigned back,
# otherwise the missing comments stay NaN for the steps below.
train["comment_text"] = train["comment_text"].fillna("fillna")
test["comment_text"] = test["comment_text"].fillna("fillna")

text_processer = text_processing()
train["clean_comment"] = text_processer.remove_stopwords(train["comment_text"])
test["clean_comment"] = text_processer.remove_stopwords(test["comment_text"])

# Apply the per-comment cleaning steps in the original order:
# clean_text on train then test, followed by glove_preprocess on both.
for cleaning_step in (text_processer.clean_text,
                      text_processer.glove_preprocess):
    train["clean_comment"] = train["clean_comment"].apply(cleaning_step)
    test["clean_comment"] = test["clean_comment"].apply(cleaning_step)

list_sentences_train = train["clean_comment"]
list_sentences_test = test["clean_comment"]

# Vocabulary is capped at max_features words.
tokenizer = text.Tokenizer(num_words=max_features)

Example #6

0

Show file

    return total


# Classify every file in ``testing_data``: tokenise its text, score it against
# each known class and remember the best-scoring class per file name.
test_file_names = os.listdir(testing_data)  # list the directory only once
total_files = len(test_file_names)
count_file = 1

for file in test_file_names:
    print("File " + str(count_file) + " out of " + str(total_files))
    count_file += 1

    # os.path.join picks the right separator instead of a hard-coded "\\".
    file_path = os.path.join(testing_data, file)

    # The context manager closes the handle; the original never closed it.
    with open(file_path, 'r') as f:
        text_body = f.read()
    token_list = text_processing.text_processing(text_body)

    best_score = -inf
    best_class = None

    # Strict '>' keeps the first class among equal scores.
    for name in list_of_classes:
        score = class_map_calculation(token_list, name)

        if score > best_score:
            best_score = score
            best_class = name

    result_dict[file] = best_class

with open("saved_result_dict.json", 'w') as f:
    json.dump(result_dict, f)

Example #7

0

Show file

File: algo1.py Project: sbunting96/10401Presentation

from text_processing import text_processing
from paperAlgo2 import algo2
import numpy

# Shared text_processing instance built from six named inputs.
# NOTE(review): what each argument refers to (file, table, key, ...) is
# defined by text_processing itself - confirm there.
t = text_processing("feature1_svd", "feature2", "clustering_data", "attraction_mapping", "attraction_information", "feature2_keywords")
# Number of users as reported by the instance; algo1 uses it as the leading
# dimension of its arrays.
M = t.num_users

def algo1(lx, lu, su, D):
  """Build the per-user arrays T, B and y from each user's clusters.

  Args:
    lx, lu, su: not referenced in the lines shown here.
      NOTE(review): confirm their role in the rest of the function.
    D: size of the two trailing dimensions of ``allU``.

  NOTE(review): ``converged``, ``allU``, ``B`` and ``y`` are assigned but not
  used or returned within the lines shown here.
  """
  converged = False
  N = 250  # fixed size of the two trailing dimensions of B and y
  allU = numpy.zeros((M, D, D))

  # T[m] = (sum over the user's clusters of size*(size-1)/2) / (sum of sizes),
  # i.e. the number of within-cluster pairs divided by the clustered item count.
  T = numpy.zeros(M)
  for m in range(0,M):
    actual = 0
    total = 0
    for cluster in t.get_user_clusters(m):
      size = len(cluster)
      actual = actual + size*(size-1)/2
      total = total+size
    # NOTE(review): raises ZeroDivisionError when a user has no clustered
    # items (total == 0) - confirm every user has at least one.
    T[m] = (actual*1.)/(total*1.)
  
  # For each user m and each item i belonging to one of its clusters:
  #   y[m,i,j] is 1 when j is in the same cluster as i, otherwise -1
  #   B[m,i,j] is 1 when j is in the same cluster as i, otherwise T[m]
  # Rows i that belong to no cluster keep their initial zeros.
  B = numpy.zeros((M,N,N))
  y = numpy.zeros((M,N,N))
  for m in range(0,M):
    clusters = t.get_user_clusters(m)
    for cluster in clusters:
      for i in cluster:
        for j in range(0,N):
          y[m,i,j] = 1 if j in cluster else -1
          B[m,i,j] = 1 if j in cluster else T[m]

Example #8

0

Show file

    probability_class_dict[folder] = (
        file_total / 16810)  # Get total file count through explorer

    # Count token occurrences over every file of the current folder.
    for file in os.listdir(folder_path):
        print("File ",
              str(file_count) + " out of " + str(file_total),
              end="; ")
        print("Folder ", str(folder_count) + " out of " + str(folder_total))
        file_count += 1

        # os.path.join picks the right separator instead of a hard-coded "\\".
        file_path = os.path.join(folder_path, file)

        # The context manager closes the handle; the original never closed it.
        with open(file_path, 'r') as f:
            text_body = f.read()
        token_list = text_processing.text_processing(
            text_body)  # Token list for all tokens in a file

        for token in token_list:
            class_dict[token] = class_dict.get(token, 0) + 1

end_time = time.time()
print("Training time: ", end_time - start_time)

# The with-statement closes the file on exit, so no explicit close() is needed.
with open("saved_master_dict.json", 'w') as f:
    json.dump(master_dict, f)

with open("saved_probability_class_dict.json", 'w') as f:

Example #9

0

Show file

File: search.py Project: SouparnaD/Wiki_Search_Engine

            field = w
            d[field] = ""
        else:
            try:
                d[field] += w + ' '
            except:
                d["all"] += w + ' '
    return d


#queries = f.readlines()

# Interactive search loop: read a query, split it into fields, normalise each
# field's text, look up matching documents and print up to ten titles together
# with the elapsed time.
while True:
    print("enter query")
    query = input()
    start_time = time()

    q = check_fields(query)
    for key in q:
        q[key] = text_processing(q[key])
    doc_ids = list(query_processing(q))

    if doc_ids:
        for i in doc_ids[:10]:
            print(doc_title_map[i], end="\r")
        print()
    else:
        print("No Results found")

    print("time taken: %.2f\n" % (time() - start_time))