def get_answers(qid, site='stackoverflow.com'):
    """Fetch a question and the bodies of its answers from a Stack Exchange site.

    Args:
        qid: Question identifier, passed straight to ``se.question``.
        site: Host name of the site. Only its prefix is inspected
            (``stacko``, ``unix``, ``sup`` or ``ser``).

    Returns:
        A ``(question_text, answers)`` tuple. ``question_text`` is the cleaned
        title and the cleaned, html2text-converted body joined by a space;
        ``answers`` is a list with the cleaned body of every answer.

    Raises:
        ValueError: If ``site`` matches none of the supported prefixes.
    """
    # Prefix of the host name -> attribute name on the ``stackexchange`` module.
    known_sites = (
        ('stacko', 'StackOverflow'),
        ('unix', 'UnixampLinux'),
        ('sup', 'SuperUser'),
        ('ser', 'ServerFault'),
    )
    for prefix, attr in known_sites:
        if site.startswith(prefix):
            se = stackexchange.Site(getattr(stackexchange, attr))
            break
    else:
        # The old code fell through here and crashed later on the unbound
        # ``se``; fail early with a message that names the offending value.
        raise ValueError('unsupported site: {!r}'.format(site))

    se.be_inclusive()
    question = se.question(qid)
    question_text = (clean(question.title) + ' ' +
                     clean(html2text(question.body)))
    answers = [clean(html2text(answer.body)) for answer in question.answers]
    return question_text, answers
def process(input_filename, gs_filename):
    """Run the hypothesis tests for every crowd size and write ``output.csv``.

    Both CSV files are read completely and passed through ``clean``; the gold
    standard is additionally passed through ``correct(..., 3)``. For each
    crowd size from 20 to 80 (inclusive) ``metrics`` is evaluated and the
    ``hypothesis_tests`` result for precision, recall and F-measure is
    appended as one row each.

    Args:
        input_filename: Path of the CSV file holding the dataset.
        gs_filename: Path of the CSV file holding the gold standard.

    Side effects:
        Writes the collected rows to ``output.csv`` (``;``-delimited) in the
        current working directory.
    """
    # newline='' is what the csv module asks for: it does its own newline
    # handling, which avoids mangled embedded newlines and blank rows.
    with open(input_filename, newline='') as f:
        dataset = list(csv.reader(f))
    dataset = clean(dataset)

    with open(gs_filename, newline='') as f:
        gs = list(csv.reader(f))
    y_true = correct(clean(gs), 3)

    output = []
    for crowd_size in range(20, 81):
        precision, recall, f_measure = metrics(dataset, y_true, crowd_size)
        output.append([crowd_size, 'precision'] +
                      hypothesis_tests(precision).tolist())
        output.append([crowd_size, 'recall'] +
                      hypothesis_tests(recall).tolist())
        output.append([crowd_size, 'f_measure'] +
                      hypothesis_tests(f_measure).tolist())

    with open('output.csv', 'w', newline='') as out:
        csv.writer(out, delimiter=';').writerows(output)
def predict(binsnum, dataPath):
    """Classify every row of ``test.csv`` and write the labels to ``output.txt``.

    For each row the per-attribute probabilities stored in ``probs`` are
    multiplied together for class ``'Y'`` and class ``'N'``, scaled by the
    class priors in ``probsClass``, and the larger product decides between
    ``'yes'`` and ``'no'``.

    Args:
        binsnum: Number of bins; converted with ``int`` and handed to
            ``preprocess.discretisize``.
        dataPath: Directory that is searched for ``test.csv`` and that
            receives ``output.txt``.

    Side effects:
        Appends to the ``pred`` list defined outside this function and writes
        ``<dataPath>/output.txt`` with one ``"<row number> <label>"`` line per
        entry of ``pred``.

    NOTE(review): ``attributes``, ``probs``, ``probsClass`` and ``pred`` are
    not defined in this function — presumably module-level state filled by
    the model-building step; confirm.
    """
    for f in listdir(dataPath):
        # read the test file
        if f == "test.csv":
            filename = dataPath + "/" + f
            test = pd.read_csv(filename)
            preprocess.clean(test, attributes)
            preprocess.discretisize(int(binsnum), test, attributes)
            test2 = test.drop(['class'], axis=1)
            for _, row in test2.iterrows():
                calcprobY = 1
                calcprobN = 1
                # calculate the probability of each class given this row
                for att in attributes:
                    column = att[0]
                    if column != 'class':
                        calcprobY = calcprobY * probs[column][row[column]]['Y']
                        calcprobN = calcprobN * probs[column][row[column]]['N']
                probY = probsClass['Y'] * calcprobY
                probN = probsClass['N'] * calcprobN
                # choose the class according to the higher probability
                if probY > probN:
                    pred.append('yes')
                else:
                    pred.append('no')

    # The context manager closes the file even if a write fails.
    with open(dataPath + "/output.txt", "w") as out:
        for number, label in enumerate(pred, start=1):
            out.write(str(number) + " " + label + "\n")
def nn_predict(text1, text2, name):
    """Score a pair of texts with the neural model registered under ``name``.

    Both texts are cleaned, converted to index sequences with ``word2ind``
    and padded to ``seq_len`` before being handed to the model as a
    two-element input list.

    Returns:
        The ``[0][0]`` entry of the model's prediction, formatted with three
        decimals.
    """
    first, second = clean(text1), clean(text2)
    sequences = [word2ind.texts_to_sequences([t])[0] for t in (first, second)]
    padded = [pad_sequences([seq], maxlen=seq_len) for seq in sequences]
    net = map_item(name, models)
    score = net.predict(padded)[0][0]
    return format(score, '.3f')
def ml_predict(text1, text2, name):
    """Score a pair of texts with the classical model registered under ``name``.

    The cleaned texts are vectorised with ``bow``, reduced with ``svd`` and
    combined by ``merge`` before ``predict_proba`` is called.

    Returns:
        The ``[0][1]`` entry of ``predict_proba``, formatted with three
        decimals.
    """
    pair = [clean(text1), clean(text2)]
    features = merge(svd.transform(bow.transform(pair)))
    classifier = map_item(name, models)
    positive = classifier.predict_proba(features)[0][1]
    return format(positive, '.3f')
def predict(text, name, thre):
    """Label ``text`` by matching it against indexed sentences.

    The text is cleaned and segmented with jieba; every distinct word is
    added to a candidate set, which ``find`` may extend using ``homo_dict``
    and ``syno_dict``. Sentences listed in ``word_sent`` for any candidate
    are collected once each, together with their labels.

    Args:
        text: Raw input text.
        name: ``'edit'`` selects ``edit_predict``; anything else selects
            ``cos_predict``.
        thre: Threshold forwarded to the chosen predictor.

    Returns:
        The chosen predictor's result, or ``'其它'`` when no sentence matched.
    """
    text = clean(text)
    cut_text = ' '.join(jieba.cut(text))

    cands = set()
    for token in cut_text.split():
        if token in cands:
            continue
        cands.add(token)
        find(token, cands, homo_dict)
        find(token, cands, syno_dict)

    seen_inds = set()
    match_sents, match_labels = [], []
    for cand in cands:
        if cand not in word_sent:
            continue
        for sent_ind, label in word_sent[cand]:
            if sent_ind in seen_inds:
                continue
            seen_inds.add(sent_ind)
            match_sents.append(sent_ind)
            match_labels.append(label)

    if not match_sents:
        return '其它'
    if name == 'edit':
        return edit_predict(text, match_sents, match_labels, thre)
    return cos_predict(cut_text, match_sents, match_labels, thre)
def __init__(self):
    """Build the vocabulary from ``vocab.csv`` and reset the data state.

    Every word that ``preprocess.clean`` yields for the first column of each
    non-empty row is registered through ``self.addWord``, followed by the
    padding and unknown-word tokens taken from ``opts``.
    """
    self.wordSet = set()
    self.vocabGrowth = 0
    self.vocabulary = {}
    self.vocabulary_inv = []

    # Build Vocab
    with open('vocab.csv', 'rb') as vocab_file:
        for row in csv.reader(vocab_file, delimiter=','):
            if not row:
                continue
            for word in preprocess.clean(row[0]):
                self.addWord(word)

    for token_key in ("sentence_padding_token", "unknown_word_token"):
        self.addWord(opts[token_key])

    self.vocabulary_size = len(self.wordSet)
    store.log("Vocabulary Size: %s" % self.vocabulary_size)

    self.embeddings = None
    self.data_index = 0
    self.data = []
def calculate_interval(path):
    """Split a transcript into windows that span at least ``interval`` seconds.

    The file at ``path`` is read as UTF-8 and passed through ``clean``; each
    resulting item provides a script line at index 0 and its timestamp at
    index 1.

    Returns:
        A ``(script, time_interval_index, times)`` tuple where

        * ``script`` is the list of transcript lines,
        * ``times`` is the list of their ``MM:SS`` timestamps,
        * ``time_interval_index[i]`` is the first index ``j > i`` such that
          ``times[j]`` lies at least ``interval`` seconds after ``times[i]``,
          or the last index when no such line exists. The window for line
          ``i`` is therefore ``script[i] .. script[time_interval_index[i]]``.

    NOTE(review): ``interval`` is not defined in this function — presumably a
    module-level setting in seconds; confirm.
    """
    with open(path, 'r', encoding='utf-8') as f:
        text = clean(f.read())
    script = [x[0] for x in text]
    times = [x[1] for x in text]

    fmt = '%M:%S'
    parsed = {}

    def _parse(idx):
        # Parse each timestamp at most once; the pairwise loop below would
        # otherwise call strptime repeatedly for the same value.
        if idx not in parsed:
            parsed[idx] = datetime.strptime(times[idx], fmt)
        return parsed[idx]

    count = len(times)
    time_interval_index = []
    for i in range(count):
        for j in range(i + 1, count):
            if (_parse(j) - _parse(i)).total_seconds() >= interval:
                time_interval_index.append(j)
                break
        else:
            # No later line is far enough away: the window runs to the end.
            time_interval_index.append(count - 1)
    return script, time_interval_index, times
def compute_words_importance(text, averaged_most_probable_category):
    """Estimate how much each distinct word contributes to the prediction.

    For every distinct word of the cleaned text, the remaining distinct
    words are joined into a reduced text, the models are queried on it, and
    the resulting most probable category is compared with
    ``averaged_most_probable_category``.

    Returns:
        A list of ``{'word': ..., 'importance': ...}`` dicts, one per
        distinct word.
    """
    cleaned_text = clean(text)
    unique_words = list(set(cleaned_text.split(' ')))

    # Text with one word left out, paired with that word.
    reduced_texts = [
        (word, ' '.join(other for other in unique_words if other != word))
        for word in unique_words
    ]
    # Category the models predict once the word is gone.
    categories = [
        (word, get_models_predictions(reduced)['most_probable_category'])
        for word, reduced in reduced_texts
    ]
    return [
        {
            'word': word,
            'importance': compute_category_difference(
                averaged_most_probable_category, category),
        }
        for word, category in categories
    ]
def test_normal_case(self):
    """Write a header plus three data rows, run ``clean``, expect 3 output rows."""
    path_test_demo = 'csv_for_testing/test_normal_case.csv'
    path_test_output = 'csv_for_testing/out_normal_case.csv'
    rows = [
        ['date', 'time', 'user', 'lightsOn'],
        ['2018-12-02', '04:00', '9351', '0'],
        ['2018-12-02', '04:00', '3440', '0'],
        ['2018-12-02', '04:00', '1688', '0'],
    ]
    # The context manager closes the fixture even if a write fails.
    with open(path_test_demo, 'w') as csvfile:
        csv.writer(csvfile).writerows(rows)

    clean(path_test_demo, path_test_output)

    # NOTE(review): 'rb' suits csv.reader on Python 2 only; Python 3 needs a
    # text-mode file — confirm which interpreter these tests target.
    with open(path_test_output, 'rb') as f:
        reader = csv.reader(f)
        assert len(list(reader)) == 3
def preprocess_dataset_h(data_path):
    """Build dataset-H from the raw CSV and write it to ``../results/dataset-H.csv``.

    The ``SLN`` column is converted to SMILES, from which InChI/InChIKey,
    names and predictions are derived through the ``preprocess`` helpers.
    The assembled table is filtered with ``preprocess.clean``, its IDs are
    regenerated, and the result is saved without the index.

    Args:
        data_path: Path of the raw CSV file; it must provide the ``SLN`` and
            ``Solubility`` columns.
    """
    print("\n\n**\nPreprocess of raw-dataset-H is started\n**")

    raw = pd.read_csv(data_path, header=0)

    smiles_list = preprocess.sln_to_smiles(raw['SLN'].values, verbose=0)
    inchi_list, inchikey_list = preprocess.smiles_to_inchi_inchikey(
        smiles_list, verbose=0)
    id_list = preprocess.generate_id_list("H", len(smiles_list))
    name_list = preprocess.collect_names_from_web(
        inchikey_list, smiles_list, verbose=0)
    logs_list = raw['Solubility'].values
    prediction_list = preprocess.collect_predictions_from_web(
        smiles_list, verbose=0)

    column_names = ['ID', 'Name', 'InChI', 'InChIKey', 'SMILES',
                    'Solubility', 'Prediction']
    column_values = [id_list, name_list, inchi_list, inchikey_list,
                     smiles_list, logs_list, prediction_list]
    dataset_h_df = pd.DataFrame(np.column_stack(column_values),
                                columns=column_names)

    # filter dataset by removing missing information.
    # (for strings: "XXX" and for numeric: "999")
    filtered = preprocess.clean(dataset_h_df)

    # update ID after filtering
    # NOTE(review): DataFrame.update aligns on the index, so this presumably
    # relies on preprocess.clean returning a freshly numbered index — confirm.
    fresh_ids = preprocess.generate_id_list("H", len(filtered.index))
    filtered.update(pd.DataFrame({'ID': fresh_ids}))

    # write dataset into CSV file
    filtered.to_csv('../results/dataset-H.csv', index=False)
    print("**\nPreprocessed dataset-H is written into dataset-H.csv\n**")
def test_missing_data(self):
    """A row that only has date and time must not appear with user 9351 in the output."""
    path_test_demo = 'csv_for_testing/test_missing_data.csv'
    path_test_output = 'csv_for_testing/out_missing_data.csv'
    with open(path_test_demo, 'w') as csvfile:
        filewriter = csv.writer(csvfile)
        filewriter.writerow(['date', 'time', 'user', 'lightsOn'])
        filewriter.writerow(['2018-10-02', '21:00'])
    # The ``with`` block already closed the file; no explicit close() needed.

    clean(path_test_demo, path_test_output)

    # Count output rows that carry the date, the time and user 9351.
    # (Named ``matches`` so the builtin ``len`` is not shadowed.)
    matches = 0
    # NOTE(review): 'rb' suits csv.reader on Python 2 only; Python 3 needs a
    # text-mode file — confirm which interpreter these tests target.
    with open(path_test_output, 'rb') as f:
        reader = csv.reader(f)
        for row in reader:
            if '2018-10-02' in row and '21:00' in row and '9351' in row:
                matches += 1
    assert matches == 0
def test_invalid_time(self):
    """No output row may still contain the invalid time ``26:00``."""
    path_test_demo = 'csv_for_testing/test_invalid_time.csv'
    path_test_output = 'csv_for_testing/out_invalid_time.csv'
    # The context manager closes the fixture even if a write fails.
    with open(path_test_demo, 'w') as csvfile:
        filewriter = csv.writer(csvfile)
        filewriter.writerow(['date', 'time', 'user', 'lightsOn'])
        filewriter.writerow(['2018-10-02', '26:00', '9351', '0'])

    clean(path_test_demo, path_test_output)

    # NOTE(review): 'rb' suits csv.reader on Python 2 only; Python 3 needs a
    # text-mode file — confirm which interpreter these tests target.
    with open(path_test_output, 'rb') as f:
        reader = csv.reader(f)
        invalid_time_removed = all('26:00' not in row for row in reader)
    assert invalid_time_removed
def __iter__(self):
    """Yield whitespace-split tokens for every entry of every ``.txt`` file.

    Walks ``self.path`` recursively, reads each ``.txt`` file as UTF-8,
    passes its content through ``clean`` and yields ``entry[0].split()`` for
    each item of the result.
    """
    for root, _dirs, names in os.walk(self.path):
        for name in names:
            if not name.endswith('.txt'):
                continue
            full_path = os.path.join(root, name)
            with open(full_path, 'r', encoding='utf-8') as handle:
                entries = clean(handle.read())
                for entry in entries:
                    yield entry[0].split()
def predict(text, name):
    """Run the ``name`` encoder/decoder pair on ``text`` and return the search result.

    The text is cleaned with the ``'en'`` setting, split on whitespace and
    converted to a padded index sequence. The models are looked up as
    ``name + '_encode'`` and ``name + '_decode'``, and ``search`` is called
    with ``cand=3`` while gradients are disabled.
    """
    en_words = clean(text, 'en').split()
    en_pad_seq = sent2ind(en_words, en_word_inds, seq_len, keep_oov=True)
    en_sent = torch.LongTensor([en_pad_seq]).to(device)

    encoder = map_item(name + '_encode', models)
    decoder = map_item(name + '_decode', models)

    with torch.no_grad():
        encoder.eval()
        state = encoder(en_sent)
        decoder.eval()
        result = search(decoder, state, en_sent, cand=3)
    return result
def score(filename, disp=True):
    """Load a pickled model and report its accuracy and recall on the test split.

    The test split comes from ``split(clean('oasis_longitudinal.csv'))``.

    Args:
        filename: Path of the pickled model file.
        disp: When true, print the model and both metrics.

    Returns:
        The unpickled model.
    """
    cleaned_df = clean('oasis_longitudinal.csv')
    _, X_test, _, Y_test = split(cleaned_df)

    # NOTE: unpickling runs arbitrary code from the file — only pass model
    # files from a trusted source.
    # The context manager closes the handle that the bare open() leaked.
    with open(filename, 'rb') as model_file:
        model = pickle.load(model_file)

    Y_pred = model.predict(X_test)
    recall = recall_score(Y_test, Y_pred)
    accuracy = accuracy_score(Y_test, Y_pred)
    if disp:
        print(model)
        print(f"Accuracy = {accuracy}")
        print(f"Recall= {recall}")
    return model
def predict(text, name):
    """Return every label with its probability, most probable first.

    ``'svm'`` and ``'xgb'`` are served by ``ml_predict``; any other name goes
    to ``nn_predict``.

    Returns:
        A ``', '``-joined string of ``"<label> <probability>"`` items with
        three decimals each.
    """
    text = clean(text)
    predictor = ml_predict if name in ('svm', 'xgb') else nn_predict
    probs = predictor(text, name)

    ranked_probs = sorted(probs, reverse=True)
    ranked_inds = np.argsort(-probs)
    ranked_labels = [ind_labels[ind] for ind in ranked_inds]
    return ', '.join(
        '{} {:.3f}'.format(label, prob)
        for label, prob in zip(ranked_labels, ranked_probs))
def predict(text, name, mode):
    """Encode ``text`` with the ``name`` encoder and decode it with the ``mode`` function.

    The cleaned text gets ``eos`` appended, is converted to an index sequence
    and is pre-padded / pre-truncated to ``seq_len``. The decoding function
    is looked up in ``funcs`` and called with ``cand=3``.
    """
    sentence = ' '.join([clean(text), eos])
    indices = word2ind.texts_to_sequences([sentence])[0]
    padded = pad_sequences([indices], maxlen=seq_len,
                           padding='pre', truncating='pre')

    encoder = map_item(name + '_encode', models)
    state = encoder.predict(padded)
    decoder = map_item(name + '_decode', models)
    decode_with = map_item(mode, funcs)
    return decode_with(decoder, state, cand=3)
def predict(text, name):
    """Return the score of every topic for ``text`` under the ``name`` model.

    The cleaned words are converted to a bag-of-words document and weighted
    by ``tfidf``; the ``(index, score)`` pairs the model returns are
    scattered into a zero vector of length ``topic_num``.

    Returns:
        The scores as a ``', '``-joined string with three decimals each.
    """
    words = clean(text)
    tfidf_doc = tfidf[word2ind.doc2bow(words)]
    topic_model = map_item(name, models)
    pairs = topic_model[tfidf_doc]

    probs = np.zeros(topic_num)
    for ind, score in pairs:
        probs[ind] = score
    return ', '.join('{:.3f}'.format(prob) for prob in probs)
def preprocess_text(text):
    """Split ``text`` into sentences and clean each one.

    Every string sentence is passed through ``preprocess.clean`` and then
    ``preprocess.clean_info``; anything that is not a string contributes an
    empty string, so the output has one entry per sentence.

    Returns:
        The list of cleaned sentences.
    """
    out = []
    for sentence in sent_tokenize(text):
        # isinstance is the idiomatic type check and also accepts str subclasses.
        if isinstance(sentence, str):
            # Named ``cleaned`` so no imported ``clean`` helper is shadowed.
            cleaned = preprocess.clean(sentence)
            cleaned = preprocess.clean_info(cleaned)
            out.append(cleaned)
        else:
            out.append("")
    return out
def buildModel(binsNum, dataPath):
    """Read the attribute structure and the training data, then fit the model.

    ``Structure.txt`` is parsed line by line into ``[name, values]`` pairs
    that are appended to ``attributes``: ``values`` is ``'NUMERIC'`` when the
    third space-separated field starts with ``N``, otherwise the text after
    ``{`` with ``}`` and the newline removed. ``train.csv`` is then loaded,
    cleaned, discretised and passed to ``makefit``.

    The structure file is always handled before the training file. The old
    loop followed ``listdir`` order, which is arbitrary, so ``train.csv``
    could be processed while ``attributes`` was still empty.

    Args:
        binsNum: Number of bins; converted with ``int`` for discretisation
            and passed unchanged to ``makefit``.
        dataPath: Directory holding ``Structure.txt`` and ``train.csv``.
            Either file is skipped when absent.

    NOTE(review): ``attributes`` is not defined in this function — presumably
    a module-level list; confirm.
    """
    entries = set(listdir(dataPath))

    if "Structure.txt" in entries:
        # The context manager closes the file the old code left open.
        with open(dataPath + "/" + "Structure.txt", 'r') as structure_file:
            for line in structure_file:
                fields = line.split(" ")
                name = fields[1]
                if fields[2][0] == 'N':
                    values = 'NUMERIC'
                else:
                    values = line.split("{")[1]
                    values = values.replace("}", "").replace("\n", "")
                attributes.append([name, values])

    if "train.csv" in entries:
        df = pd.read_csv(dataPath + "/" + "train.csv")
        preprocess.clean(df, attributes)  # complete the missing values
        preprocess.discretisize(int(binsNum), df, attributes)  # discretise the data
        makefit(df, binsNum)  # build the model
def get_predictions(query, model_name, magic_string):
    """Print the ten highest-scoring predictions for ``query``.

    The query is cleaned, tokenised and given a leading axis. Models whose
    name contains ``'conv'`` but not ``'embedding'`` get one more axis at
    position 2. Each printed line maps the prediction index through ``map_``
    and shows the score with two decimals; a blank line follows.
    """
    cleaned = clean(query)
    uses_embedding = 'embedding' in model_name
    batch = np.expand_dims(tokenize(cleaned, magic_string, uses_embedding), 0)
    if 'conv' in model_name and not uses_embedding:
        batch = np.expand_dims(batch, 2)

    scores = model.predict(batch)[0]
    ranked = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)
    lines = [f"{map_[str(idx)]}: {score:.2f}" for idx, score in ranked[:10]]
    print('\n'.join(lines))
    print()
def predict_text(value):
    """Classify ``value`` with every model in ``dt_cls``.

    The text is cleaned and stripped of punctuation. The lemmatisation and
    stop-word steps are disabled in this function and stay disabled.

    Returns:
        A ``(dt_preds, dt_probas)`` tuple of dicts keyed by model name:
        the label (``'not abusive'`` when ``proba[0] >= 0.5``, otherwise
        ``'abusive'``) and ``proba.tolist()``.

    NOTE(review): ``proba[0] >= 0.5`` presumably expects ``predict_proba`` to
    return a 1-D result for this single input — confirm against the models
    stored in ``dt_cls``.
    """
    # preprocess the given string
    # clean strings
    prepro = clean(value)
    verbose = debug == True
    if verbose:
        print("clean() done:\n{}\n\n ".format(prepro))
        logging.debug("clean() done:\n%s\n\n ", prepro)

    # remove punctuation
    prepro = remove_signs(prepro)
    if verbose:
        print("remove_signs() done:\n{}\n\n ".format(prepro))
        logging.debug("remove_signs() done:\n%s\n\n ", prepro)
    # preprocessing finished

    # predict with all available scikit models
    dt_preds, dt_probas = {}, {}
    for mod_name, classifier in dt_cls.items():
        if verbose:
            logging.debug("predicting with model: %s", mod_name)
        proba = classifier.predict_proba(prepro)
        dt_preds[mod_name] = 'not abusive' if proba[0] >= 0.5 else 'abusive'
        dt_probas[mod_name] = proba.tolist()

    if verbose:
        print("dt_probas: {}".format(dt_probas))
    return (dt_preds, dt_probas)
def predict(text, name, thre):
    """Score every label by the feature weights of the words found in ``text``.

    For each label, the weights of the segmented words present in that
    label's feature table are summed and divided by the total word count;
    labels with no matching word score ``0.0``.

    Returns:
        Whatever ``sort(scores, labels, thre, cand=5)`` returns.
    """
    text = clean(text)
    words = list(jieba.cut(text))
    label_pairs = map_item(name, feats)
    labels = list(label_pairs.keys())

    scores = []
    for pairs in label_pairs.values():
        hits = [pairs[word] for word in words if word in pairs]
        scores.append(sum(hits) / len(words) if hits else 0.0)
    return sort(scores, labels, thre, cand=5)
def predict(text, name):
    """Return every label with its softmax probability, most probable first.

    The cleaned text is converted to a padded index sequence and fed to the
    ``name`` model in eval mode with gradients disabled.

    Returns:
        A ``', '``-joined string of ``"<label> <probability>"`` items with
        three decimals each.
    """
    text = clean(text)
    pad_seq = sent2ind(text, word_inds, seq_len, keep_oov=True)
    sent = torch.LongTensor([pad_seq]).to(device)
    net = map_item(name, models)

    with torch.no_grad():
        net.eval()
        probs = F.softmax(net(sent), dim=1)
    probs = probs.numpy()[0]

    ranked_probs = sorted(probs, reverse=True)
    ranked_inds = np.argsort(-probs)
    ranked_labels = [ind_labels[ind] for ind in ranked_inds]
    return ', '.join(
        '{} {:.3f}'.format(label, prob)
        for label, prob in zip(ranked_labels, ranked_probs))
def predict(text, name):
    """Return every label with its probability, most probable first.

    For the ``'adnn'`` model the attention weights from the ``adnn_core``
    model are additionally plotted for the last ``len(text)`` positions.

    Returns:
        A ``', '``-joined string of ``"<label> <probability>"`` items with
        three decimals each.
    """
    text = clean(text)
    indices = word2ind.texts_to_sequences([text])[0]
    pad_seq = pad_sequences([indices], maxlen=seq_len)
    net = map_item(name, models)
    probs = net.predict(pad_seq)[0]

    ranked_probs = sorted(probs, reverse=True)
    ranked_inds = np.argsort(-probs)
    ranked_labels = [ind_labels[ind] for ind in ranked_inds]
    formats = [
        '{} {:.3f}'.format(label, prob)
        for label, prob in zip(ranked_labels, ranked_probs)
    ]

    if name == 'adnn':
        core = map_item(name + '_core', models)
        atts = core.predict(pad_seq)[0]
        plot_att(text, atts[-len(text):])
    return ', '.join(formats)
def eval(self, item):
    """Classify ``item`` and publish the result on the ``"server"`` channel.

    The sentence is cleaned and padded, each word is mapped to its
    vocabulary id, and the id array is evaluated by ``self.model``. The
    result is printed (stdout is flushed) and published through
    ``self.redis`` as a JSON object with the keys ``sentence`` and
    ``classification``.
    """
    sentence = preprocess.clean(item)
    padded_sentence = preprocess.pad(sentence)

    # get word id's
    word_ids = [vocab.getIdFromWord(word) for word in padded_sentence]

    # run evaluation
    result = self.model.eval(np.array(word_ids))
    print("eval:: {0}: \"{1}\"".format(result, item))
    import sys
    sys.stdout.flush()

    payload = json.dumps({"sentence": item, "classification": result})
    self.redis.publish("server", payload)
def pure_model(choices, query, model, magic_string, model_name,
               return_weights=False):
    """Return the names of the ten choices the network ranks highest for ``query``.

    The query is cleaned, tokenised and given a leading axis. Models whose
    name contains ``'conv'`` but not ``'embedding'`` get one more axis at
    position 2.

    Args:
        choices: Sequence indexed by prediction position; each item provides
            a ``'name'`` entry.
        query: Raw query text.
        model: Object whose ``predict`` is called on the prepared query.
        magic_string: Forwarded to ``tokenize``.
        model_name: Selects the tokenisation mode and the input shape.
        return_weights: When true, return ``(name, score)`` tuples instead
            of bare names.
    """
    cleaned = clean(query)
    uses_embedding = 'embedding' in model_name
    batch = np.expand_dims(tokenize(cleaned, magic_string, uses_embedding), 0)
    if 'conv' in model_name and not uses_embedding:
        batch = np.expand_dims(batch, 2)

    scores = model.predict(batch)[0]
    top = sorted(enumerate(scores), key=lambda pair: pair[1],
                 reverse=True)[:10]
    if return_weights:
        return [(choices[idx]['name'], weight) for idx, weight in top]
    return [choices[idx]['name'] for idx, _ in top]
def predict(text, name):
    """Run the ``name`` encoder/decoder pair on ``text`` and return the prediction.

    The cleaned text gets ``eos`` appended and is converted to a
    ``'pre'``-padded index sequence. When ``name`` is ``'att'`` and the
    module runs as a script, the attention weights from the ``att_core``
    model are also plotted.

    Returns:
        The result of ``search(decode, state, cand=3)``.
    """
    en_words = ' '.join([clean(text, 'en'), eos]).split()
    en_pad_seq = sent2ind(en_words, en_word_inds, seq_len, 'pre',
                          keep_oov=True)
    en_sent = torch.LongTensor([en_pad_seq]).to(device)

    encoder = map_item(name + '_encode', models)
    decoder = map_item(name + '_decode', models)

    with torch.no_grad():
        encoder.eval()
        state = encoder(en_sent)
        decoder.eval()
        zh_pred = search(decoder, state, cand=3)

        wants_plot = name == 'att' and __name__ == '__main__'
        if wants_plot:
            zh_text = bos + zh_pred
            zh_pad_seq = sent2ind(zh_text, zh_word_inds, seq_len, 'post',
                                  keep_oov=True)
            zh_sent = torch.LongTensor([zh_pad_seq]).to(device)
            core = map_item(name + '_core', models)
            atts = core(zh_sent, state)[0]
            plot_att(en_words[:-1], zh_text[1:] + eos, atts)
        return zh_pred
def mixed_model(choices, query, model, magic_string, model_name,
                return_weights=False):
    """Rank choices for ``query`` by merging fuzzy matches with network scores.

    Fuzzy matches come from ``process.extract`` with ``fuzz.ratio``; their
    scores are normalised by their sum (floored at 0.001 so an all-zero
    result cannot divide by zero). Network scores come from ``model.predict``
    on the cleaned, tokenised query. Both lists are merged, sorted by weight
    in descending order, and reduced to the first occurrence of each name.

    Args:
        choices: Sequence indexed by prediction position; each item provides
            a ``'name'`` entry.
        query: Raw query text.
        model: Object whose ``predict`` is called on the prepared query.
        magic_string: Forwarded to ``tokenize``.
        model_name: Selects the tokenisation mode and the input shape.
        return_weights: When true, return ``(name, weight)`` tuples instead
            of bare names.

    Returns:
        The unique names in ranked order, or ``(name, weight)`` tuples when
        ``return_weights`` is true.
    """
    names = [s['name'] for s in choices]
    fuzzy_results = process.extract(query, names, scorer=fuzz.ratio)
    fuzzy_sum = max(sum(r[1] for r in fuzzy_results), 0.001)
    fuzzy_matches_and_confidences = [(r[0], r[1] / fuzzy_sum)
                                     for r in fuzzy_results]

    # net
    query = clean(query)
    query = tokenize(query, magic_string, 'embedding' in model_name)
    query = np.expand_dims(query, 0)
    if 'conv' in model_name and 'embedding' not in model_name:
        query = np.expand_dims(query, 2)
    prediction = model.predict(query)[0]
    weighted = sorted(enumerate(prediction), key=lambda e: e[1], reverse=True)
    net_weighted = [(choices[idx]['name'], score) for idx, score in weighted]

    sorted_weighted = sorted(fuzzy_matches_and_confidences + net_weighted,
                             key=lambda e: e[1], reverse=True)

    # build results list, unique — the set makes each membership test O(1)
    # instead of scanning the growing result list.
    seen = set()
    results = []
    weights = []
    for name, weight in sorted_weighted:
        if name in seen:
            continue
        seen.add(name)
        results.append(name)
        weights.append(weight)

    if not return_weights:
        return results
    return list(zip(results, weights))
def train():
    """Train the TextCNN classifier on ``data.csv`` and save a checkpoint.

    Rows whose first column equals ``"example"`` get the label ``[0, 1]``;
    all other rows get ``[1, 0]``. The second column is cleaned, padded and
    mapped to vocabulary ids. A stratified 90/10 split provides the training
    and dev sets; the dev set is evaluated every ``opts["evaluate_every"]``
    steps.

    NOTE(review): the code uses Python 2 idioms (``open(..., 'rb')`` for csv,
    ``map`` results used as lists) and pre-1.0 TensorFlow / old scikit-learn
    APIs — confirm the target environment before modernising.
    """
    sentences = []
    labels = []
    x = []
    y = []
    _y = []  # flat 0/1 labels, used only for the stratified split
    with open('data.csv', 'rb') as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            words = preprocess.clean(row[1])
            sentences.append(words)
            labels.append(([0, 1] if row[0] == "example" else [1, 0]))
            _y.append(1 if row[0] == "example" else 0)
    padded_sentences = [
        preprocess.pad(sentence) for sentence in sentences
    ]
    x = np.array([[vocab.getIdFromWord(word) for word in sentence] for sentence in padded_sentences])
    # NOTE(review): the two ``for`` clauses below look swapped — ``sentence``
    # is iterated before the clause that binds it, so on Python 2 this reads
    # the variable leaked by the comprehension above. Confirm the intent
    # (probably ``for sentence in padded_sentences for word in sentence``).
    embeddings = np.array(map(np.unique, ([
        vocab.getEmbeddingFromWord(word) for word in sentence for sentence in padded_sentences])))
    store.log(embeddings)
    store.log(len(embeddings))
    store.log(embeddings[0])
    store.log(len(embeddings[0]))
    y = np.array(labels)
    # Split Dataset
    # ==================================================
    # Load data
    print("Loading data...")
    # Randomly shuffle data
    sss = StratifiedShuffleSplit(_y, 1, test_size=0.1, random_state=0)
    # A single split is requested, so this loop body runs once.
    # ``train`` here is a local that shadows the function name.
    for train, test in sss:
        x_train = x[train]
        y_train = y[train]
        x_dev = x[test]
        y_dev = y[test]
    # Training
    # ==================================================
    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=opts["allow_soft_placement"],
            log_device_placement=opts["log_device_placement"])
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(
                sequence_length=x_train.shape[1],
                num_classes=2,
                vocab_size=len(embeddings),
                embedding_size=opts["embedding_dim"],
                embedding_tensor=embeddings,
                filter_sizes=map(int, opts["filter_sizes"].split(",")),
                num_filters=opts["num_filters"],
                l2_reg_lambda=opts["l2_reg_lambda"])
            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(1e-4)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
            saver = tf.train.Saver(tf.all_variables())
            # Initialize all variables
            sess.run(tf.initialize_all_variables())

            def train_step(x_batch, y_batch):
                """ A single training step """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: opts["dropout_keep_prob"]
                }
                # The fetched step/loss/accuracy values are not used here.
                _, step, loss, accuracy = sess.run(
                    [train_op, global_step, cnn.loss, cnn.accuracy], feed_dict)

            def dev_step(x_batch, y_batch):
                """ Evaluates model on a dev set """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0  # no dropout during evaluation
                }
                step, loss, accuracy = sess.run(
                    [global_step, cnn.loss, cnn.accuracy], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))

            # Generate batches
            batches = batch_iter(
                zip(x_train, y_train), opts["batch_size"], opts["num_epochs"])
            # Training loop. For each batch...
            for batch in batches:
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % opts["evaluate_every"] == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev)
                    print("")
            # NOTE(review): assumed to run once after the training loop; the
            # original nesting of this call should be confirmed.
            saver.save(sess, opts["model_location"] + "model.chpt")
#coding:utf-8
# predict corpus and save results to a file
from preprocess import clean
from jc_model import jc_model

testset_file = '../TestSet/Test5000'
result_file1 = '../TestSet/Pred5000(byTrainSet50)'
result_file2 = '../TestSet/Pred5000(byTrainSet250)'

if __name__ == '__main__':
    model = jc_model()
    # Swap in result_file2 to write the TrainSet250 predictions instead.
    # The context manager closes both files even if a line fails to parse.
    with open(testset_file, 'r') as fi, open(result_file1, 'w') as fo:
        for raw_line in fi:
            # Each input line is "<id>\t<text>"; the file is read as bytes
            # and decoded here (Python 2 style I/O).
            line = raw_line.decode("utf-8").rstrip('\n')
            seg_list = line.split('\t')
            sample_id = seg_list[0]
            text = seg_list[1]
            text_cleaned = clean(text)
            pred_label = model.classify(text_cleaned)
            # Output line: "<id>\t<predicted label>\t<original text>"
            record = sample_id + '\t' + str(pred_label) + '\t' + text + '\n'
            fo.write(record.encode('utf-8'))