Example #1
def get_answers(qid, site='stackoverflow.com'):
    if site.startswith('stacko'):
        # StackOverflow
        se = stackexchange.Site(stackexchange.StackOverflow)
    elif site.startswith('unix'):
        # Unix StackExchange
        se = stackexchange.Site(stackexchange.UnixampLinux)
    elif site.startswith('sup'):
        # Super User
        se = stackexchange.Site(stackexchange.SuperUser)
    elif site.startswith('ser'):
        # ServerFault
        se = stackexchange.Site(stackexchange.ServerFault)
    else:
        raise ValueError('Unsupported site: {}'.format(site))

    se.be_inclusive()

    question = se.question(qid)
    question_text = clean(question.title) + ' ' + clean(
        html2text(question.body))
    answers = []
    for answer in question.answers:
        answers.append(clean(html2text(answer.body)))
    return question_text, answers
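A hypothetical call to the get_answers() helper above; the question id below is only a placeholder, and the stackexchange, html2text and clean dependencies are assumed to be importable:

question_text, answers = get_answers(123456, site='stackoverflow.com')
print(question_text)
for ans in answers:
    print(ans[:80])  # preview the first 80 characters of each cleaned answer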
Example #2
def process(input_filename, gs_filename):
    dataset = []
    with open(input_filename) as f:
        reader = csv.reader(f)
        dataset = [row for row in reader]

    dataset = clean(dataset)
    #sampled_dataset = sample(dataset, 20)
    #print("Result of N_Method:",n_method(sampled_dataset, 5))
    #print("Result of P_Method:",p_method(sampled_dataset, 5))
    #print("Testing the n method against the p method:",kolgomorov2samples(n_method(sampled_dataset, 5),p_method(sampled_dataset, 5)))
    #print("Test Z between methods n and p:",testz(n_method(sampled_dataset, 5),p_method(sampled_dataset, 5)))

    gs = []
    with open(gs_filename) as f:
        reader = csv.reader(f)
        gs = [row for row in reader]

    y_true = correct(clean(gs), 3)

    output = []
    for crowd_size in range(20, 81):
        p, r, f = metrics(dataset, y_true, crowd_size)
        output.append([crowd_size, 'precision'] + hypothesis_tests(p).tolist())
        output.append([crowd_size, 'recall'] + hypothesis_tests(r).tolist())
        output.append([crowd_size, 'f_measure'] + hypothesis_tests(f).tolist())

    with open('output.csv', 'w') as f:
        writer = csv.writer(f, delimiter=';')
        for line in output:
            writer.writerow(line)
Example #3
def predict(binsnum, dataPath):
    for f in listdir(dataPath):
        #read the test file
        if f == "test.csv":
            filename = dataPath + "/" + f
            test = pd.read_csv(filename)
            preprocess.clean(test, attributes)
            preprocess.discretisize(int(binsnum), test, attributes)
            test2 = test.drop(['class'], axis=1)
            for index, row in test2.iterrows():
                calcprobY = 1
                calcprobN = 1
                # calculate the probability of each class given this row
                for att in attributes:
                    if att[0] != 'class':
                        calcprobY = calcprobY * ((
                            (probs[att[0]])[row[att[0]]])['Y'])
                        calcprobN = calcprobN * (
                            probs[att[0]][row[att[0]]]['N'])
                probY = probsClass['Y'] * calcprobY
                probN = probsClass['N'] * calcprobN
                #choose the class according to the higher probability
                if probY > probN:
                    pred.append('yes')
                else:
                    pred.append('no')
            with open(dataPath + "/output.txt", "w") as file:
                for i, label in enumerate(pred, start=1):
                    file.write(str(i) + " " + label + "\n")
Example #4
def nn_predict(text1, text2, name):
    text1, text2 = clean(text1), clean(text2)
    seq1 = word2ind.texts_to_sequences([text1])[0]
    seq2 = word2ind.texts_to_sequences([text2])[0]
    pad_seq1 = pad_sequences([seq1], maxlen=seq_len)
    pad_seq2 = pad_sequences([seq2], maxlen=seq_len)
    model = map_item(name, models)
    prob = model.predict([pad_seq1, pad_seq2])[0][0]
    return '{:.3f}'.format(prob)
Example #5
def ml_predict(text1, text2, name):
    text1, text2 = clean(text1), clean(text2)
    text = [text1, text2]
    sent = bow.transform(text)
    sent = svd.transform(sent)
    sent = merge(sent)
    model = map_item(name, models)
    prob = model.predict_proba(sent)[0][1]
    return '{:.3f}'.format(prob)
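Here bow, svd and merge are loaded elsewhere in the project; a minimal sketch of what such a pipeline could look like, assuming a scikit-learn CountVectorizer bag-of-words model, TruncatedSVD for dimensionality reduction, and vector concatenation as the merge step (the corpus and text pair below are made up):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD

# toy corpus standing in for the project's training data
corpus = ['how to reset my password', 'reset password help',
          'track my order status', 'where is my order']

bow = CountVectorizer().fit(corpus)
svd = TruncatedSVD(n_components=2, random_state=0).fit(bow.transform(corpus))

def merge(sent):
    # concatenate the two sentence vectors into a single feature row
    return np.concatenate([sent[0], sent[1]]).reshape(1, -1)

pair = ['how do i reset my password', 'password reset help']
features = merge(svd.transform(bow.transform(pair)))
print(features.shape)  # (1, 4)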
Example #6
def predict(text, name, thre):
    text = clean(text)
    cut_text = ' '.join(jieba.cut(text))
    words = cut_text.split()
    cands = set()
    for word in words:
        if word not in cands:
            cands.add(word)
            find(word, cands, homo_dict)
            find(word, cands, syno_dict)
    ind_set = set()
    match_sents, match_labels = list(), list()
    for cand in cands:
        if cand in word_sent:
            pairs = word_sent[cand]
            for sent_ind, label in pairs:
                if sent_ind not in ind_set:
                    ind_set.add(sent_ind)
                    match_sents.append(sent_ind)
                    match_labels.append(label)
    if match_sents:
        if name == 'edit':
            return edit_predict(text, match_sents, match_labels, thre)
        else:
            return cos_predict(cut_text, match_sents, match_labels, thre)
    else:
        return '其它'  # fallback label meaning "other" when nothing matches
Example #7
    def __init__(self):

        self.wordSet = set()
        self.vocabGrowth = 0
        self.vocabulary = {}
        self.vocabulary_inv = []

        # Build Vocab
        with open('vocab.csv', 'rb') as f:
            reader = csv.reader(f, delimiter=',')

            for row in reader:
                if len(row) > 0:
                    words = preprocess.clean(row[0])
                    for word in words:
                        self.addWord(word)

        self.addWord(opts["sentence_padding_token"])
        self.addWord(opts["unknown_word_token"])
        self.vocabulary_size = len(self.wordSet)
        store.log("Vocabulary Size: %s" % self.vocabulary_size)


        self.embeddings = None
        self.data_index = 0
        self.data = []
Example #8
def calculate_interval(path):
    """
	Given a transcript file, create snippets of time interval = interval.
	script = each line in the transcript (broken at the subtitle timing breaks)
	times = timestamps in the video corresponding to each of the lines in the script
	time_interval_index = a list where each element, i, is the index of the list times
		such that script[i] and script[times[i]] are >=interval seconds apart
		therefore, the window = snippet between script[i] and script[time_interval_index[i]]
	"""
    with open(path, 'r', encoding='utf-8') as f:
        text = clean(f.read())
    script = [x[0] for x in text]
    times = [x[1] for x in text]

    time_interval_index = []
    l = len(times)
    fmt = '%M:%S'
    for i in range(l):
        for j in range(i + 1, l):
            t1, t2 = times[i], times[j]
            tdelta = datetime.strptime(t2, fmt) - datetime.strptime(t1, fmt)
            if tdelta.total_seconds() >= interval:
                time_interval_index.append(j)
                break
        else:
            time_interval_index.append(l - 1)
    return script, time_interval_index, times
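A minimal, self-contained sketch of the same windowing logic, with made-up timestamps and a 30-second interval, to make the meaning of time_interval_index concrete:

from datetime import datetime

# hypothetical subtitle timestamps and a 30-second interval
times = ['00:05', '00:20', '00:40', '01:10']
interval = 30
fmt = '%M:%S'

time_interval_index = []
for i in range(len(times)):
    for j in range(i + 1, len(times)):
        delta = datetime.strptime(times[j], fmt) - datetime.strptime(times[i], fmt)
        if delta.total_seconds() >= interval:
            time_interval_index.append(j)
            break
    else:
        time_interval_index.append(len(times) - 1)

print(time_interval_index)  # [2, 3, 3, 3]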
Example #9
def compute_words_importance(text, averaged_most_probable_category):
    cleaned_text = clean(text)
    words = list(set(cleaned_text.split(' ')))

    # for each word, rebuild the text with that word left out
    words_with_texts = [
        (word, ' '.join(w for w in words if w != word)) for word in words
    ]
    # re-run the models on each reduced text
    results_without_each_word = [
        (word, get_models_predictions(reduced)['most_probable_category'])
        for word, reduced in words_with_texts
    ]

    return [{
        'word': word,
        'importance': compute_category_difference(
            averaged_most_probable_category, category)
    } for word, category in results_without_each_word]
Example #10
    def test_normal_case(self):
        path_test_demo = 'csv_for_testing/test_normal_case.csv'
        path_test_output = 'csv_for_testing/out_normal_case.csv'
        csvfile = open(path_test_demo, 'w')
        filewriter = csv.writer(csvfile)
        filewriter.writerow(['date', 'time', 'user', 'lightsOn'])
        filewriter.writerow(['2018-12-02', '04:00', '9351', '0'])
        filewriter.writerow(['2018-12-02', '04:00', '3440', '0'])
        filewriter.writerow(['2018-12-02', '04:00', '1688', '0'])
        csvfile.close()
        clean(path_test_demo, path_test_output)
        # open file
        with open(path_test_output, 'rb') as f:
            reader = csv.reader(f)

            assert len(list(reader)) == 3
Example #11
def preprocess_dataset_h(data_path):

    print("\n\n**\nPreprocess of raw-dataset-H is started\n**")

    data = pd.read_csv(data_path, header=0)
    smiles_list = preprocess.sln_to_smiles(data['SLN'].values, verbose=0)

    inchi_list, inchikey_list = preprocess.smiles_to_inchi_inchikey(smiles_list, verbose=0)
    id_list = preprocess.generate_id_list("H", len(smiles_list))
    name_list = preprocess.collect_names_from_web(inchikey_list, smiles_list, verbose=0)
    logs_list = data['Solubility'].values
    prediction_list = preprocess.collect_predictions_from_web(smiles_list, verbose=0)

    dataset_h_df = pd.DataFrame(
        np.column_stack([id_list, name_list, inchi_list, inchikey_list,
                         smiles_list, logs_list, prediction_list]),
        columns=['ID', 'Name', 'InChI', 'InChIKey', 'SMILES', 'Solubility', 'Prediction'])

    # filter dataset by removing missing information (for strings: "XXX", for numeric: "999")
    dataset_h_df_clean = preprocess.clean(dataset_h_df)

    # update ID after filtering
    id_list = preprocess.generate_id_list("H", len(dataset_h_df_clean.index))
    id_clean_df = pd.DataFrame({'ID': id_list})
    dataset_h_df_clean.update(id_clean_df)

    # write dataset into CSV file
    dataset_h_df_clean.to_csv('../results/dataset-H.csv', index=False)
    print("**\nPreprocessed dataset-H is written into dataset-H.csv\n**")

    return
Example #12
    def test_missing_data(self):
        path_test_demo = 'csv_for_testing/test_missing_data.csv'
        path_test_output = 'csv_for_testing/out_missing_data.csv'
        with open(path_test_demo, 'w') as csvfile:
            filewriter = csv.writer(csvfile)
            filewriter.writerow(['date', 'time', 'user', 'lightsOn'])
            filewriter.writerow(['2018-10-02', '21:00'])
        clean(path_test_demo, path_test_output)
        # open file
        count = 0
        with open(path_test_output, 'rb') as f:
            reader = csv.reader(f)
            for row in reader:
                if '2018-10-02' in row and '21:00' in row \
                        and '9351' in row:
                    count += 1
            assert count == 0
Example #13
    def test_invalid_time(self):
        path_test_demo = 'csv_for_testing/test_invalid_time.csv'
        path_test_output = 'csv_for_testing/out_invalid_time.csv'
        csvfile = open(path_test_demo, 'w')
        filewriter = csv.writer(csvfile)
        filewriter.writerow(['date', 'time', 'user', 'lightsOn'])
        filewriter.writerow(['2018-10-02', '26:00', '9351', '0'])
        csvfile.close()
        clean(path_test_demo, path_test_output)
        # open file
        with open(path_test_output, 'rb') as f:
            reader = csv.reader(f)

            invalid_flag = True
            for row in reader:
                if '26:00' in row:
                    invalid_flag = False
            assert invalid_flag
Example #14
    def __iter__(self):
        for r, d, files in os.walk(self.path):
            for f in files:
                if f.endswith('.txt'):
                    file_path = os.path.join(r, f)
                    with open(file_path, 'r', encoding='utf-8') as fp:
                        clean_text = clean(fp.read())
                        for x in clean_text:
                            yield x[0].split()
Example #15
def predict(text, name):
    en_text = clean(text, 'en')
    en_words = en_text.split()
    en_pad_seq = sent2ind(en_words, en_word_inds, seq_len, keep_oov=True)
    en_sent = torch.LongTensor([en_pad_seq]).to(device)
    encode = map_item(name + '_encode', models)
    decode = map_item(name + '_decode', models)
    with torch.no_grad():
        encode.eval()
        state = encode(en_sent)
        decode.eval()
        return search(decode, state, en_sent, cand=3)
Example #16
def score(filename, disp=True):
    cleaned_df = clean('oasis_longitudinal.csv')
    _, X_test, _, Y_test = split(cleaned_df)

    with open(filename, 'rb') as f:
        model = pickle.load(f)
    Y_pred = model.predict(X_test)
    recall = recall_score(Y_test, Y_pred)
    accuracy = accuracy_score(Y_test, Y_pred)
    if disp:
        print(model)
        print(f"Accuracy = {accuracy}")
        print(f"Recall = {recall}")
    return model
Example #17
def predict(text, name):
    text = clean(text)
    if name == 'svm' or name == 'xgb':
        probs = ml_predict(text, name)
    else:
        probs = nn_predict(text, name)
    sort_probs = sorted(probs, reverse=True)
    sort_inds = np.argsort(-probs)
    sort_preds = [ind_labels[ind] for ind in sort_inds]
    formats = list()
    for pred, prob in zip(sort_preds, sort_probs):
        formats.append('{} {:.3f}'.format(pred, prob))
    return ', '.join(formats)
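Several of these predict() snippets rank class probabilities with the same argsort idiom; a tiny self-contained illustration with made-up probabilities and label names:

import numpy as np

# made-up class probabilities and label names
probs = np.array([0.1, 0.7, 0.2])
ind_labels = {0: 'neg', 1: 'pos', 2: 'neutral'}

sort_probs = sorted(probs, reverse=True)   # [0.7, 0.2, 0.1]
sort_inds = np.argsort(-probs)             # [1, 2, 0]
sort_preds = [ind_labels[ind] for ind in sort_inds]

print(', '.join('{} {:.3f}'.format(p, v) for p, v in zip(sort_preds, sort_probs)))
# pos 0.700, neutral 0.200, neg 0.100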
Example #18
def predict(text, name, mode):
    text1 = clean(text)
    sent1 = ' '.join([text1, eos])
    seq1 = word2ind.texts_to_sequences([sent1])[0]
    pad_seq1 = pad_sequences([seq1],
                             maxlen=seq_len,
                             padding='pre',
                             truncating='pre')
    encode = map_item(name + '_encode', models)
    state = encode.predict(pad_seq1)
    decode = map_item(name + '_decode', models)
    func = map_item(mode, funcs)
    return func(decode, state, cand=3)
Example #19
def predict(text, name):
    words = clean(text)
    bow_doc = word2ind.doc2bow(words)
    tfidf_doc = tfidf[bow_doc]
    model = map_item(name, models)
    pairs = model[tfidf_doc]
    probs = np.zeros(topic_num)
    for ind, score in pairs:
        probs[ind] = score
    formats = list()
    for prob in probs:
        formats.append('{:.3f}'.format(prob))
    return ', '.join(formats)
Example #20
    def preprocess_text(text):
        text = sent_tokenize(text)
        out = []
        for sentence in text:
            if type(sentence) == str:
                # clean text
                clean = preprocess.clean(sentence)
                # clean info
                clean = preprocess.clean_info(clean)

                out.append(clean)
            else:
                out.append("")
        return out
Example #21
def buildModel(binsNum, dataPath):
    #open the Structure and train files
    for f in listdir(dataPath):
        if f == "Structure.txt":
            filename = dataPath + "/" + f
            with open(filename, 'r') as file:
                for line in file:
                    name = line.split(" ")[1]
                    if line.split(" ")[2][0] == 'N':
                        values = 'NUMERIC'
                    else:
                        values = line.split("{")[1]
                        values = values.replace("}", "")
                        values = values.replace("\n", "")
                    attributes.append([name, values])
        if f == "train.csv":
            filename = dataPath + "/" + f
            df = pd.read_csv(filename)
    preprocess.clean(df, attributes)  #complete the missing values
    preprocess.discretisize(int(binsNum), df,
                            attributes)  #discretisize the data
    makefit(df, binsNum)  #build the model
Example #22
def get_predictions(query, model_name, magic_string):
    query = clean(query)
    query = tokenize(query, magic_string, 'embedding' in model_name)
    query = np.expand_dims(query, 0)
    if 'conv' in model_name and 'embedding' not in model_name:
        query = np.expand_dims(query, 2)

    prediction = model.predict(query)
    prediction = prediction[0]

    indexed = list(enumerate(prediction))
    weighted = sorted(indexed, key=lambda e: e[1], reverse=True)

    print('\n'.join([f"{map_[str(r[0])]}: {r[1]:.2f}" for r in weighted[:10]]))
    print()
Example #23
def predict_text(value):
    # preprocess the given string
    # clean strings
    prepro = clean(value)
    if debug == True:
        print("clean() done:\n{}\n\n ".format(prepro))
        logging.debug("clean() done:\n%s\n\n ", prepro)
    # remove punctuation
    prepro = remove_signs(prepro)
    if debug == True:
        print("remove_signs() done:\n{}\n\n ".format(prepro))
        logging.debug("remove_signs() done:\n%s\n\n ", prepro)
    # lemmatize the text
    #prepro = lemm(prepro)
    #if debug == True:
    #    print("lemm() done:\n{}\n\n".format(prepro))
    #    logging.debug("lemm() done:\n%s\n\n", prepro)
    # remove stop words
    #prepro = remove_stopwords(prepro)
    #if debug == True:
    #    print("remove_stopwords() done:\n{}\n\n".format(prepro))
    #    logging.debug("remove_stopwords() done:\n%s\n\n", prepro)
    # preprocessing finished

    ####
    # predict with all available scikit models
    dt_preds = {}
    dt_probas = {}
    for mod_name, model in dt_cls.items():
        if debug == True:
            #print("predicting with model name: {} and model: {}".format(mod_name, model))
            logging.debug("predicting with model: %s", mod_name)

        proba = model.predict_proba(prepro)

        if proba[0] >= 0.5:
            pred = 'not abusive'
        else:
            pred = 'abusive'

        dt_preds[mod_name] = pred
        # confidence of correctness for current model
        # probability = "{0:.2%}".format(proba.max())
        dt_probas[mod_name] = proba.tolist()  #probability
        if debug == True:
            print("dt_probas: {}".format(dt_probas))

    return (dt_preds, dt_probas)
Example #24
def predict(text, name, thre):
    text = clean(text)
    words = list(jieba.cut(text))
    label_pairs = map_item(name, feats)
    labels = list(label_pairs.keys())
    scores = list()
    for pairs in label_pairs.values():
        match_scores = list()
        for word in words:
            if word in pairs:
                match_scores.append(pairs[word])
        if match_scores:
            scores.append(sum(match_scores) / len(words))
        else:
            scores.append(0.0)
    return sort(scores, labels, thre, cand=5)
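The loop above averages matched keyword weights over the full query length for each label; a small self-contained sketch with hypothetical feature weights shows the scoring step in isolation (the project's sort() helper is replaced by a plain print here):

# hypothetical per-label keyword weights (the real ones come from map_item(name, feats))
label_pairs = {'greeting': {'你好': 0.75, '早上': 0.5},
               'farewell': {'再见': 0.8}}
words = ['你好', '请问', '现在']

scores = []
for pairs in label_pairs.values():
    match_scores = [pairs[word] for word in words if word in pairs]
    scores.append(sum(match_scores) / len(words) if match_scores else 0.0)

print(dict(zip(label_pairs.keys(), scores)))  # {'greeting': 0.25, 'farewell': 0.0}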
Example #25
def predict(text, name):
    text = clean(text)
    pad_seq = sent2ind(text, word_inds, seq_len, keep_oov=True)
    sent = torch.LongTensor([pad_seq]).to(device)
    model = map_item(name, models)
    with torch.no_grad():
        model.eval()
        probs = F.softmax(model(sent), dim=1)
    probs = probs.numpy()[0]
    sort_probs = sorted(probs, reverse=True)
    sort_inds = np.argsort(-probs)
    sort_preds = [ind_labels[ind] for ind in sort_inds]
    formats = list()
    for pred, prob in zip(sort_preds, sort_probs):
        formats.append('{} {:.3f}'.format(pred, prob))
    return ', '.join(formats)
Example #26
def predict(text, name):
    text = clean(text)
    seq = word2ind.texts_to_sequences([text])[0]
    pad_seq = pad_sequences([seq], maxlen=seq_len)
    model = map_item(name, models)
    probs = model.predict(pad_seq)[0]
    sort_probs = sorted(probs, reverse=True)
    sort_inds = np.argsort(-probs)
    sort_preds = [ind_labels[ind] for ind in sort_inds]
    formats = list()
    for pred, prob in zip(sort_preds, sort_probs):
        formats.append('{} {:.3f}'.format(pred, prob))
    if name == 'adnn':
        core = map_item(name + '_core', models)
        atts = core.predict(pad_seq)[0]
        plot_att(text, atts[-len(text):])
    return ', '.join(formats)
Example #27
    def eval(self, item):
        sentence = preprocess.clean(item)
        padded_sentence = preprocess.pad(sentence)
        word_ids = []

        # get word id's
        for word in padded_sentence:
            id = vocab.getIdFromWord(word)
            word_ids.append(id)

        # run evaluation
        result = self.model.eval(np.array(word_ids))
        print("eval:: {0}:  \"{1}\"".format(result, item))
        import sys
        sys.stdout.flush()
        self.redis.publish(
            "server", json.dumps({
                "sentence": item,
                "classification": result
            }))
Example #28
def pure_model(choices,
               query,
               model,
               magic_string,
               model_name,
               return_weights=False):
    query = clean(query)
    query = tokenize(query, magic_string, 'embedding' in model_name)
    query = np.expand_dims(query, 0)
    if 'conv' in model_name and 'embedding' not in model_name:
        query = np.expand_dims(query, 2)

    prediction = model.predict(query)
    prediction = prediction[0]

    indexed = list(enumerate(prediction))
    weighted = sorted(indexed, key=lambda e: e[1], reverse=True)
    if not return_weights:
        return [choices[r[0]]['name'] for r in weighted[:10]]
    return [(choices[r[0]]['name'], r[1]) for r in weighted[:10]]
Example #29
def predict(text, name):
    en_text = clean(text, 'en')
    en_text = ' '.join([en_text, eos])
    en_words = en_text.split()
    en_pad_seq = sent2ind(en_words, en_word_inds, seq_len, 'pre', keep_oov=True)
    en_sent = torch.LongTensor([en_pad_seq]).to(device)
    encode = map_item(name + '_encode', models)
    decode = map_item(name + '_decode', models)
    with torch.no_grad():
        encode.eval()
        state = encode(en_sent)
        decode.eval()
        zh_pred = search(decode, state, cand=3)
        if name == 'att' and __name__ == '__main__':
            zh_text = bos + zh_pred
            zh_pad_seq = sent2ind(zh_text, zh_word_inds, seq_len, 'post', keep_oov=True)
            zh_sent = torch.LongTensor([zh_pad_seq]).to(device)
            core = map_item(name + '_core', models)
            atts = core(zh_sent, state)[0]
            plot_att(en_words[:-1], zh_text[1:] + eos, atts)
        return zh_pred
Example #30
def mixed_model(choices,
                query,
                model,
                magic_string,
                model_name,
                return_weights=False):
    names = [s['name'] for s in choices]
    fuzzy_results = process.extract(query, names, scorer=fuzz.ratio)
    fuzzy_sum = max(sum(r[1] for r in fuzzy_results), 0.001)
    fuzzy_matches_and_confidences = [(r[0], r[1] / fuzzy_sum)
                                     for r in fuzzy_results]

    # net
    query = clean(query)
    query = tokenize(query, magic_string, 'embedding' in model_name)
    query = np.expand_dims(query, 0)
    if 'conv' in model_name and 'embedding' not in model_name:
        query = np.expand_dims(query, 2)

    prediction = model.predict(query)
    prediction = prediction[0]

    indexed = list(enumerate(prediction))
    weighted = sorted(indexed, key=lambda e: e[1], reverse=True)
    net_weighted = [(choices[r[0]]['name'], r[1]) for r in weighted]

    sorted_weighted = sorted(fuzzy_matches_and_confidences + net_weighted,
                             key=lambda e: e[1],
                             reverse=True)

    # build results list, unique
    results = []
    weights = []
    for r in sorted_weighted:
        if r[0] not in results:
            results.append(r[0])
            weights.append(r[1])
    if not return_weights:
        return results
    return list(zip(results, weights))
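The tail of mixed_model() merges the normalized fuzzy scores with the network scores and keeps only the best score per name; a self-contained sketch of that merge-and-dedup step with made-up values:

# hypothetical scores to illustrate the merge-and-dedup step above
fuzzy = [('alpha', 0.40), ('beta', 0.35)]
net = [('beta', 0.50), ('gamma', 0.10)]

merged = sorted(fuzzy + net, key=lambda e: e[1], reverse=True)

results, weights = [], []
for name, weight in merged:
    if name not in results:  # keep only the first (highest-scoring) entry per name
        results.append(name)
        weights.append(weight)

print(list(zip(results, weights)))  # [('beta', 0.5), ('alpha', 0.4), ('gamma', 0.1)]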
Example #31
def train():

    sentences = []
    labels = []
    x = []
    y = []
    _y = []

    with open('data.csv', 'rb') as f:
        reader = csv.reader(f, delimiter=',')

        for row in reader:
            words = preprocess.clean(row[1])
            sentences.append(words)
            labels.append(([0, 1] if row[0] == "example" else [1, 0]))
            _y.append(1 if row[0] == "example" else 0)

    padded_sentences = [ preprocess.pad(sentence) for sentence in sentences ]

    x = np.array([[vocab.getIdFromWord(word) for word in sentence] for sentence in padded_sentences])
    embeddings = np.array(map(np.unique, [vocab.getEmbeddingFromWord(word) for sentence in padded_sentences for word in sentence]))
    store.log(embeddings)
    store.log(len(embeddings))
    store.log(embeddings[0])
    store.log(len(embeddings[0]))
    y = np.array(labels)


    # Split Dataset
    # ==================================================

    # Load data
    print("Loading data...")
    # Randomly shuffle data
    sss = StratifiedShuffleSplit(_y, 1, test_size=0.1, random_state=0)
    for train, test in sss:
        x_train = x[train]
        y_train = y[train]

        x_dev = x[test]
        y_dev = y[test]


    # Training
    # ==================================================

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
          allow_soft_placement=opts["allow_soft_placement"],
          log_device_placement=opts["log_device_placement"])
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(
                sequence_length=x_train.shape[1],
                num_classes=2,
                vocab_size=len(embeddings),
                embedding_size=opts["embedding_dim"],
                embedding_tensor=embeddings,
                filter_sizes=map(int, opts["filter_sizes"].split(",")),
                num_filters=opts["num_filters"],
                l2_reg_lambda=opts["l2_reg_lambda"])

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-4)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

            saver = tf.train.Saver(tf.all_variables())

            # Initialize all variables
            sess.run(tf.initialize_all_variables())

            def train_step(x_batch, y_batch):
                """
                A single training step
                """
                feed_dict = {
                  cnn.input_x: x_batch,
                  cnn.input_y: y_batch,
                  cnn.dropout_keep_prob: opts["dropout_keep_prob"]
                }
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn.loss, cnn.accuracy],
                    feed_dict)

            def dev_step(x_batch, y_batch):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                  cnn.input_x: x_batch,
                  cnn.input_y: y_batch,
                  cnn.dropout_keep_prob: 1.0
                }
                step, loss, accuracy = sess.run(
                    [global_step, cnn.loss, cnn.accuracy],
                    feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

            # Generate batches
            batches = batch_iter(
                zip(x_train, y_train), opts["batch_size"], opts["num_epochs"])
            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % opts["evaluate_every"] == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev)
                    print("")

            saver.save(sess, opts["model_location"] + "model.chpt")
Example #32
#coding:utf-8

# predict corpus and save results to a file

from preprocess import clean
from jc_model import jc_model

testset_file = '../TestSet/Test5000'
result_file1 = '../TestSet/Pred5000(byTrainSet50)'
result_file2 = '../TestSet/Pred5000(byTrainSet250)'

if __name__ == '__main__':
    model = jc_model()
    fi = open(testset_file, 'r')
    fo = open(result_file1, 'w')
    # fo = open(result_file2, 'w')
    while True:
        line = fi.readline().decode("utf-8")
        if len(line) == 0: # Zero length indicates EOF
            break
        line = line.rstrip('\n')
        seg_list = line.split('\t')
        id = seg_list[0]
        text = seg_list[1]
        text_cleaned = clean(text)
        pred_label = model.classify(text_cleaned)
        string = id + '\t' + str(pred_label) + '\t' + text + '\n'
        fo.write(string.encode('utf-8'))
    fi.close()
    fo.close()