def main(args):
    data = json.load(open(args.input_refexps_json, 'r'))
    max_length = 0
    all_refexps = []
    for keys in data:
        for ref_id in data[keys]:
            all_refexps.append(data[keys][ref_id])
    for r in all_refexps:
        t = tokenize(r, punct_to_keep=[',', ';'], punct_to_remove=['?', '.'])
        if len(t) > max_length:
            max_length = len(t)
    refexp_token_to_idx = build_vocab(
        all_refexps, punct_to_keep=[',', ';'], punct_to_remove=['?', '.'])
    with open(args.output_vocab_json, 'w') as f:
        json.dump(refexp_token_to_idx, f)
    with h5py.File(args.output_refexps_h5df, 'w') as f:
        for keys in data:
            one_image_refexps = []
            one_image_refexps_to_idx = []
            img_all_refexps = data[keys]
            for ref_id in img_all_refexps:
                refexp = img_all_refexps[ref_id]
                one_image_refexps.append(refexp)
            for refexps in one_image_refexps:
                tokens = tokenize(refexps, punct_to_remove=['?', '.'],
                                  punct_to_keep=[';', ','])
                refexps_idx = encode(tokens, refexp_token_to_idx)
                one_image_refexps_to_idx.append(refexps_idx)
            # Pad every encoded expression to the global max length with <NULL>
            for refexp_ in one_image_refexps_to_idx:
                num_null = max_length - len(refexp_)
                if num_null > 0:
                    refexp_ += [refexp_token_to_idx['<NULL>']] * num_null
            one_image_refexps_to_idx_numpy = np.asarray(
                one_image_refexps_to_idx, dtype=np.int32)
            f.create_dataset(keys, data=one_image_refexps_to_idx_numpy)
def dataset(input_directory, batch_size, emb_dict, max_time):
    file_gen = os.listdir(input_directory)
    random.shuffle(file_gen)
    batch_num = 0
    inputs = np.zeros((batch_size, max_time))
    targets = np.zeros((batch_size, 2))
    sequence_length = np.zeros((batch_size))
    for name in file_gen:
        file_path = input_directory + '/' + name
        try:
            with open(file_path) as f:
                w = preprocess.tokenize(f.read())
        except UnicodeDecodeError:
            print('Encountered unicode error, continuing')
            continue
        # Filenames look like "<id>_<rating>.txt"; the rating is the second token
        rating = int(re.sub(r'_|\.txt', ' ', name).split()[1])
        targets[batch_num][0:2] = [0, 1] if rating < 5 else [1, 0]
        sequence_length[batch_num] = len(w)
        for time_num in range(min(max_time, len(w))):
            inputs[batch_num][time_num] = emb_dict.get(w[time_num], 0)
        batch_num += 1
        if batch_num == batch_size:
            yield inputs, targets, sequence_length
            batch_num = 0
            inputs = np.zeros((batch_size, max_time))
            targets = np.zeros((batch_size, 2))
            sequence_length = np.zeros((batch_size))
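# Hedged usage sketch for the generator above: it assumes IMDB-style review
# files named like "123_8.txt" (rating after the underscore) and a word-to-id
# embedding dictionary built elsewhere; the directory and dictionary below
# are illustrative assumptions, not part of the original code.
#
#   emb_dict = {'the': 1, 'movie': 2, 'great': 3}
#   for inputs, targets, lengths in dataset('reviews/train', 32, emb_dict, 200):
#       print(inputs.shape)   # (32, 200)
#       break                 # one batch is enough for a smoke test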
def titile_tokens():
    cnt = 0
    try:
        sw_path = os.path.join(cwd, "text/news_stopwords.txt")
        sw_list = prep.get_stopwords(sw_path)
        conn = MongoClient("127.0.0.1", 27017)
        db = conn.netease
        target = db.token_war.find({})
        for i in target:
            if "title_keywords" not in i or not i['title_keywords']:
                cnt += 1
                if len(i['keywords']) > 1:  # the article has keywords
                    title_keywords = i['title'] + '\t\t' + ' '.join(i['keywords'])
                else:
                    title_keywords = i['title']
                tokens_string = ' '.join(
                    prep.tokenize(title_keywords, sw_list=sw_list, language='CN'))
                db.token_war.update_one(
                    {'number': i['number']},
                    {'$set': {'title_keywords': tokens_string}})
        print("%d titles tokenized!" % cnt)
        conn.close()
    except Exception as e:
        print("From:titile_tokens:\t\nUnexpected error: {}".format(e))
def main(args):
    '''Example script to run prediction on a pre-trained model with sample lyrics dataset'''
    c = Configuration()
    if args.artist:
        c.set_artist(args.artist)
    print("Artist:", c.artist.replace("_", " ").title())
    lyrics_dataset = pp.read_lyrics_files(c.path)
    dictionary = torch.load(open(c.dictionary_path, 'rb'))
    print("Vocabulary size: ", len(dictionary))
    print("----------------------------")
    tokenized = pp.tokenize(lyrics_dataset)
    seed_lyrics = generate_seed_lyrics(tokenized, c.window_size, args.censored)
    model = torch.load(open(c.model_path, 'rb'))
    predicted_lyrics = predict(model, seed_lyrics, dictionary,
                               num_words=args.words, topk=c.predict_topk)
    predicted_lyrics = postprocess(predicted_lyrics, args.censored)
    print(predicted_lyrics)
def art_tokenize():
    cnt = 0
    new = []
    point = datetime(2018, 1, 1)
    try:
        conn = MongoClient("127.0.0.1", 27017)
        db = conn.netease
        target = db.war.find({'date': {'$gt': point}})
        for i in target:
            if db.token_war.find({'number': i['number']}).count() < 1:
                tokens_list = prep.tokenize(i['content'], sw_list=[], language='CN')
                del i['content'], i['comments'], i['commenturl'], i['tie_count']
                i['tokens'] = ' '.join(tokens_list)
                new.append(i)
                cnt += 1
        if new:
            # insert in bulk
            db.token_war.insert_many(new)
            msg = "%d articles tokenized!" % cnt
        else:
            msg = "No new articles require tokenizing!"
        conn.close()
        print(msg)
        return msg
    except Exception as e:
        print("From:art_tokenize:\t\nUnexpected error: {}".format(e))
def prepare_inputs(token_mapping, w2v_W, w2v_U, sentences):
    """
    Converts a 2-D list of sentences (list of list of words) to one-hot
    encoded tokens of shape [n_sentences, n_words, len(token_mapping), 1],
    then projects each sentence through the word2vec weights w2v_W and w2v_U.
    Unknown tokens are replaced by the mean one-hot encoding of up to two
    context tokens on each side.
    """
    tokens = [tokenize(token_mapping, sentence) for sentence in sentences]
    depth = len(token_mapping)
    one_hot_tokens = []
    for sentence in tokens:
        one_hot_sentence = []
        for i, token in enumerate(sentence):
            if token != token_mapping['#UNK#']:
                one_hot_sentence.append(one_hot_encode(token, depth))
            else:
                if i <= 2:
                    context_tokens = sentence[:i] + sentence[i + 1:i + 3]
                else:
                    context_tokens = sentence[i - 2:i] + sentence[i + 1:i + 3]
                context_one_hot = [one_hot_encode(token, depth)
                                   for token in context_tokens]
                context_mean = np.mean(np.asarray(context_one_hot), axis=0)
                one_hot_sentence.append(context_mean)
        one_hot_tokens.append(one_hot_sentence)
    one_hot_tokens = [np.asarray(ls) for ls in one_hot_tokens]
    vec_tokens = [word2vec(w2v_W, w2v_U, sentence)
                  for sentence in tqdm(one_hot_tokens, desc='Vectorizing tokens')]
    return vec_tokens
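# Hedged usage sketch: the token ids, helper functions, and weight shapes
# below are illustrative assumptions only. An in-vocabulary word keeps its
# own one-hot vector; an '#UNK#' word is replaced by the mean of the one-hot
# vectors of up to two neighbours on each side before the word2vec projection.
#
#   token_mapping = {'#UNK#': 0, 'the': 1, 'cat': 2, 'sat': 3}
#   vecs = prepare_inputs(token_mapping, w2v_W, w2v_U, [['the', 'cat', 'sat']])
#   # vecs[0] holds one embedding vector per word of the sentence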
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--dir', default='data')
    args = parser.parse_args()

    model_file = os.path.join(args.dir, 'model.pickle')
    with open(model_file, 'rb') as f:
        model = pickle.load(f)
    label_encoder_file = os.path.join(args.dir, 'label_encoder.pickle')
    with open(label_encoder_file, 'rb') as f:
        label_encoder = pickle.load(f)
    vectorizer_file = os.path.join(args.dir, 'vectorizer.pickle')
    with open(vectorizer_file, 'rb') as f:
        vectorizer = pickle.load(f)

    tagger = MeCab.Tagger('-Owakati')
    while True:
        text = input()
        tokenized = preprocess.tokenize(tagger, text)
        x = vectorizer.transform([tokenized])
        y = model.predict(x)
        label = label_encoder.inverse_transform(y)[0]
        print('Tokenized:', tokenized)
        print('Label:', label)
def categorize_this_happy_moment(happy_moment):
    tokens = preprocess.tokenize(happy_moment)
    tokens = lstm_convert_data([tokens], vocab)
    tokens = np.array(tokens)
    tokens = sequence.pad_sequences(tokens, maxlen=max_review_length)
    prediction = model.predict(tokens)
    max_index = np.argmax(np.array(prediction))
    print(predict_category.get(max_index))
def __init__(self, review_filename):
    self.review_rating = int(re.search(r'_\d+', review_filename).group()[1:])
    target_list = [0, 1] if self.review_rating > 5 else [1, 0]
    self.sentiment = 'pos' if self.review_rating > 5 else 'neg'
    review_file = open(review_filename).read()
    self.tokens = preprocess.tokenize(review_file)
    self.targets = np.array([target_list])
    self.length = len(self.tokens)
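# Hedged worked example: for a filename like '204_9.txt', re.search(r'_\d+', ...)
# matches '_9' and the [1:] slice keeps '9', so review_rating == 9, sentiment
# is 'pos', and targets is [[0, 1]]; a rating of 4 would flip both.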
def clean(s):
    s = preprocess(s)
    tokenize_s = tokenize(s)
    # Keep only alphabetic tokens longer than one character; filtering into a
    # new list avoids skipping elements while mutating the list in place.
    tokenize_s = [item for item in tokenize_s
                  if item.isalpha() and len(item) > 1]
    return ' '.join(tokenize_s)
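# Hedged example: the exact output depends on this module's preprocess() and
# tokenize(); with a plain whitespace tokenizer,
#   clean("I have 2 cats !")
# would drop "I" (single letter), "2" (non-alphabetic), and "!",
# leaving "have cats".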
def converse():
    # Simple REPL: read a sentence, tokenize it, and print the bot's response.
    while True:
        user_input = input('>> ')
        sentence_array = preprocess.tokenize(user_input)
        print('Response:')
        print(get_response(sentence_array, 50))
        print()
def send():
    userID = None
    if request.method == 'POST':
        userID = request.form['userID']
        keyword = request.form['keyword']
        result = sameModel.recommendProducts(int(userID), 100)
        tmp = {}  # key = business_num, value = business_rating
        lda_list = []
        for line in result:
            tmp[line[1]] = line[2]
            lda_list.append(line[1])
        [bn, br] = openfile()
        name_list = []
        for key in lda_list:
            name_list.append(bn[key])
        Recommendation = name_list[:10]

        # LSI + word matching, only when a keyword was supplied
        if len(request.form['keyword']) >= 1:
            rawList = tokenize(br, lda_list)
            stopped_result = stop_words(rawList)
            [corpus, dictionary] = doc_term_matrix(stopped_result)
            lsiList = lsi(corpus, dictionary)

            # word matching
            document = [[] for _ in range(99)]
            for i in range(99):
                item = lsiList[i]
                for word in item:
                    document[i].append(','.join([str(word[0])]))
            model = gensim.models.Word2Vec(document, min_count=1)
            a = [0] * 100
            dic = {}  # cumulative similarity score -> business id
            for i in range(99):
                business = document[i]
                for item in business:
                    a[i] = a[i] + model.similarity(keyword, item)
                dic[a[i]] = lda_list[i]
            a = sorted(a, reverse=True)
            Recommendation = []
            for key2 in a[0:10]:
                Recommendation.append(bn[dic[key2]])
        return render_template('index.html', Recommendation=Recommendation)
    return render_template('index.html')
def process(smi):
    smis = smi.strip()
    # Only include compounds that exclusively
    # use tokens the model can generate
    if any(tok not in model.vocab2id for tok in tokenize(smis)):
        return None
    # Standardize each '.'-separated SMILES component
    return [molvs.standardize_smiles(part) for part in smis.split('.')]
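# Hedged usage note: `model.vocab2id` and tokenize() are this project's own
# objects. Given a vocabulary covering the tokens of 'CCO.[Na+].[Cl-]',
# process() returns one molvs-standardized SMILES per '.'-separated component;
# any out-of-vocabulary token makes it return None instead.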
def get_new_vec(model, docs, sw_path=default_sw_path, language="CN"):
    sw_list = get_stopwords(sw_path)
    new_doc_vec = []
    try:
        for doc in docs:
            tokens = tokenize(doc, sw_list, language=language)
            doc_vec = model.infer_vector(tokens)
            new_doc_vec.append(doc_vec)
        return new_doc_vec
    except Exception as e:
        print("From get_new_vec:\n\tUnexpected error: {}".format(e))
def __iter__(self):
    sw_list = get_stopwords(self.sw_path)
    f = open(self.file_path)
    csv_reader = csv.reader(f, delimiter='\t')
    for i, line in enumerate(csv_reader):
        if i + 1 > self.lines:
            self.lines = i + 1
        self.label_list.append(line[0])  # get the doc label
        tag = "%s_%s" % (self.file_name, str(i))
        yield doc2vec.TaggedDocument(tokenize(line[1], sw_list, self.t),
                                     tags=[tag])
def main(argv): emot_dic = {"anger":0, "joy":1, "sadness":2, "fear":3} x = tokenize(argv[0]) emot = np.array([0, 0, 0, 0]) emot[emot_dic[argv[1]]] = 1 model = get_model("model.h5") out = model.predict([x.reshape((1, 50, 1)), emot.reshape((1, 4))])[0] print(argv[1], ":", np.argmax(out))
def predict(doc):
    predictions = {}
    predictions['input'] = doc
    doc = tokenize(doc).rstrip()
    predictions['tokenized'] = doc
    doc_vector = vectorize(doc, word_vec, tfidf)
    for method in METHODS:
        predictions[method] = {
            'rule': METHODS[method](doc),
            'model': models[method].predict([doc_vector])[0]
        }
    return predictions
def BM25Similarity(Query, Passage, k1=1.5, b=0.75, delimiter=' '):
    global docIDFDict, avgDocLength
    query_words = tokenize(Query.strip().lower())
    passage_words = tokenize(Passage.strip().lower())
    passageLen = len(passage_words)
    docTF = {}
    for word in set(query_words):  # term frequency of each unique query word
        docTF[word] = passage_words.count(word)
    commonWords = set(query_words) & set(passage_words)
    tmp_score = []
    for word in commonWords:
        numer = docTF[word] * (k1 + 1)  # numerator of the BM25 formula
        denom = docTF[word] + k1 * (1 - b + b * passageLen / avgDocLength)  # denominator of the BM25 formula
        if word in docIDFDict:
            tmp_score.append(docIDFDict[word] * numer / denom)
    score = sum(tmp_score)
    return score
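# Hedged smoke test: BM25Similarity reads the module-level docIDFDict and
# avgDocLength, so both must be populated first. The values below are toy
# assumptions rather than real corpus statistics, and this presumes the
# module's tokenize() does whitespace-style word splitting.
docIDFDict = {'quick': 1.2, 'fox': 2.3}
avgDocLength = 8.0
print(BM25Similarity('quick fox', 'the quick brown fox jumps over the lazy dog'))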
def main(args):
    '''Example script to train a model on the sample lyrics dataset'''
    c = Configuration()
    if args.artist:
        c.set_artist(args.artist)
    print("Hyperparameters: ", c)
    print("Loading data from path: ", c.path)
    lyrics_dataset = pp.read_lyrics_files(c.path)
    tokenized = pp.tokenize(lyrics_dataset)
    x, y, dictionary = pp.preprocess(tokenized, c.window_size)
    training_data = DataLoader(list(zip(x, y)),
                               batch_size=c.train_batch_size, shuffle=True)
    model = LyricPredictor(len(dictionary), c.output_size)
    print("Training model...")
    model, _, _ = train(model=model, training_data=training_data,
                        num_epochs=c.num_epochs, lr=c.lr,
                        grad_norm=c.grad_max_norm)
    print("Saving model: ", c.model_path)
    torch.save(model, c.model_path)
    print("Saving dictionary: ", c.dictionary_path)
    torch.save(dictionary, c.dictionary_path)
    print("Generating lyrics...")
    seed_lyrics = generate_seed_lyrics(tokenized, c.window_size, args.censored)
    predicted_lyrics = predict(model, seed_lyrics, dictionary,
                               num_words=args.words, topk=c.predict_topk)
    predicted_lyrics = postprocess(predicted_lyrics, args.censored)
    print(predicted_lyrics)
def prepare_inputs(token_mapping, sentences):
    """
    Converts a 2-D list of sentences (list of list of words) to one-hot
    encoded tokens of shape [n_sentences, n_words, len(token_mapping), 1].
    """
    tokens = [tokenize(token_mapping, sentence) for sentence in sentences]
    depth = len(token_mapping)
    one_hot_tokens = [[one_hot_encode(token, depth) for token in sentence]
                      for sentence in tokens]
    # list of [n_words, len(token_mapping), 1] arrays
    one_hot_tokens = [np.asarray(ls) for ls in one_hot_tokens]
    return one_hot_tokens
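# Hedged shape check, following the docstring: with a 3-token mapping each
# word becomes a (3, 1) one-hot array, so a single 2-word sentence yields
# one array of shape (2, 3, 1). The mapping below is illustrative only.
#
#   token_mapping = {'a': 0, 'b': 1, '#UNK#': 2}
#   prepare_inputs(token_mapping, [['a', 'b']])[0].shape   # (2, 3, 1)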
def process_text_from_string(text):
    """
    Process one text given as a string.

    Args:
        text: string containing all the text

    Returns:
        A row-like array of all the metrics extracted from the text
    """
    from preprocess import tokenize
    sentences, words = tokenize(text)
    return get_info(text, sentences, words)
def get_predictions(query, model_name, magic_string):
    query = clean(query)
    query = tokenize(query, magic_string, 'embedding' in model_name)
    query = np.expand_dims(query, 0)
    if 'conv' in model_name and 'embedding' not in model_name:
        query = np.expand_dims(query, 2)
    prediction = model.predict(query)
    prediction = prediction[0]
    indexed = list(enumerate(prediction))
    weighted = sorted(indexed, key=lambda e: e[1], reverse=True)
    print('\n'.join([f"{map_[str(r[0])]}: {r[1]:.2f}" for r in weighted[:10]]))
    print()
def build_corpus_dictionary():
    input_path = 'dataset/comments_array.json'
    json_array = preprocess.load_json_file(input_path)
    field_array = ['content']
    str_list = preprocess.extract_from_json(json_array, field_array)
    texts = preprocess.tokenize(str_list)
    removed_texts = preprocess.remove_stop_words(texts)
    dictionary = corpora.Dictionary(removed_texts)
    corpus = preprocess.convert_texts_to_corpus(removed_texts, dictionary)
    return corpus, dictionary
def transform(text):
    text = text.lower()
    text = normalize_thai_number(text)
    text = unescape_html(text)
    text = remove_markup_tag(text)
    text = normalize_link(text)
    text = normalize_mention(text)
    text = normalize_email(text)
    text = normalize_laugh(text)
    text = normalize_number(text, place_holder='')
    text = normalize_emoji(text)
    hashtags = extract_hashtag(text)
    text = normalize_hashtag(text, place_holder='')
    tokens = tokenize(text, stopwords=None, punctuation=punctuation)
    tokens = replace_with_actual_hashtag(tokens, hashtags)
    return tokens
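# Hedged usage sketch: every normalize_* helper above is module-specific, so
# the exact tokens depend on their behaviour. Roughly, a post containing a
# link, a hashtag, and a Thai laugh marker ("555") would have the link and
# number placeholders emptied, the laugh normalized, and the hashtag first
# extracted and then restored as a token at the end of the pipeline:
#
#   transform("check this https://t.co/x #fun 555")
#   # -> tokens with '#fun' restored and no raw URL (illustrative only)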
def main():
    input_path = 'dataset/taipei_city.json'
    json_array = preprocess.load_json_file(input_path)
    field_array = ['content']
    str_list, answer = preprocess.extract_from_json_with_answer(
        json_array['data'], field_array)
    texts = preprocess.tokenize(str_list)
    removed_texts = preprocess.remove_stop_words(texts)
    # dictionary = pickle.load(open('dictionary.obj', 'rb'))
    dictionary = corpora.Dictionary(removed_texts)
    data_corpus = preprocess.convert_texts_to_corpus(removed_texts, dictionary)
    # corpus = pickle.load(open('corpus.obj', 'rb'))
    result_table = pd.DataFrame()

    # preprocess with Tfidf model
    params = {"corpus": data_corpus}
    X, y = convert_to_X_y(TfidfModel, params, data_corpus, answer)
    result_table = train_with_dummy(result_table, X, y, 'tfidf')
    result_table = train_with_random_forest(result_table, X, y, 'tfidf')
    result_table = train_with_logistic_regression(result_table, X, y, 'tfidf')

    '''
    # preprocess with lda model
    for num_topics in [10, 50, 100, 150, 200]:
        params = {"corpus": data_corpus, "num_topics": num_topics}
        X, y = convert_to_X_y(LdaModel, params, data_corpus, answer)
        result_table = train_with_dummy(result_table, X, y, 'lda_' + str(params['num_topics']))
        result_table = train_with_random_forest(result_table, X, y, 'lda_' + str(params['num_topics']))
        result_table = train_with_logistic_regression(result_table, X, y, 'lda_' + str(params['num_topics']))

    # preprocess with lsi model
    for num_topics in [10, 50, 100, 150, 200]:
        params = {"corpus": data_corpus, "num_topics": num_topics}
        X, y = convert_to_X_y(LsiModel, params, data_corpus, answer)
        result_table = train_with_dummy(result_table, X, y, 'lsi_' + str(params['num_topics']))
        result_table = train_with_random_forest(result_table, X, y, 'lsi_' + str(params['num_topics']))
        result_table = train_with_logistic_regression(result_table, X, y, 'lsi_' + str(params['num_topics']))
    '''

    output_file = sys.argv[1]
    result_table.to_csv(output_file, sep='\t')
def main(argv): emot_dic = {"anger": 0, "joy": 1, "sadness": 2, "fear": 3} model = get_model("model.h5") for s in ["anger", "joy", "sadness", "fear"]: df = pd.read_csv(argv[0], sep='\t', header=None, encoding='utf-8', quoting=3) df.columns = ['id', 'text', 'polarity', 'class'] df = df[df["polarity"] == s] test = np.array(df['text']) test_type = np.array(df["polarity"]) X = [] for x in test: X.append(tokenize(x)) X = np.array(X) emot = np.zeros((len(test), 4)) for x in range(len(test_type)): emot[x, emot_dic[test_type[x]]] = 1 out = model.predict( [X.reshape((len(test), 50, 1)), emot.reshape((len(test), 4))]) y_ = np.array(df["class"]) y = np.array([int(x[0]) for x in y_]) acc = np.count_nonzero(y == out.argmax(axis=1)) / float( out.argmax(axis=1).shape[0]) print(s, acc) df["class"] = out.argmax(axis=1) df.to_csv("EI-oc_en_" + s + "_pred.txt", sep='\t', header=None, index=None)
def setup(self):
    corpus = self._create_corpus()
    self.tokens = pp.tokenize(corpus)
    sequences = []
    for line in corpus:
        token_list = self.tokens.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i + 1]
            sequences.append(n_gram_sequence)
    padder = tf.keras.preprocessing.sequence.pad_sequences
    sequences = np.array(padder(sequences, padding='pre'))
    input_sequences, labels = sequences[:, :-1], sequences[:, -1]
    total_words = len(self.tokens.word_index) + 1
    one_hot_labels = tf.keras.utils.to_categorical(labels, num_classes=total_words)
    return input_sequences, one_hot_labels
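# Hedged walk-through: for a line whose token ids are [5, 3, 7], the inner
# loop emits the n-gram prefixes [5, 3] and [5, 3, 7]; after 'pre'-padding
# to the longest sequence, the last column is split off as the label and the
# remaining columns form the input sequence.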
def pure_model(choices, query, model, magic_string, model_name,
               return_weights=False):
    query = clean(query)
    query = tokenize(query, magic_string, 'embedding' in model_name)
    query = np.expand_dims(query, 0)
    if 'conv' in model_name and 'embedding' not in model_name:
        query = np.expand_dims(query, 2)
    prediction = model.predict(query)
    prediction = prediction[0]
    indexed = list(enumerate(prediction))
    weighted = sorted(indexed, key=lambda e: e[1], reverse=True)
    if not return_weights:
        return [choices[r[0]]['name'] for r in weighted[:10]]
    return [(choices[r[0]]['name'], r[1]) for r in weighted[:10]]
def build_model(lines_num=-1):
    dataset = open('rawdata.csv', 'r')
    [bidList, rawList] = tokenize(dataset, lines_num)
    stopped_result = stop_words(rawList, lines_num)
    stem_result = stem(stopped_result, lines_num)
    [corpus, dictionary] = doc_term_matrix(stem_result)
    # it seems like the words make more sense without stemming; still working on it

    ################
    ####  LDA  #####
    ################
    ldaList = lda(corpus, dictionary, lines_num)
    print("load data...")
    with open('outfile', 'wb') as fp:
        pickle.dump(ldaList, fp)
def tokenize():
    content = request.args.get('content', None, type=str)
    print("From view.py tokenize: %s" % content)
    sw_list = prep.get_stopwords(
        "/home/skipper/study/python/project/text/news_stopwords.txt")
    tokens = prep.tokenize(content, sw_list, language="CN")
    for i in tokens:
        print(i)
    tokens_string = '/'.join(tokens)
    print(tokens_string)
    pos_tag, pos_string = prep.pos_test(tokens)
    print("From view.py tokenize: %s\n%s\n%s\n" % (tokens_string, pos_tag, pos_string))
    detail = {}
    detail['tokens_string'] = tokens_string
    detail['pos_tag'] = pos_tag
    detail['pos_string'] = pos_string
    return jsonify(detail)
def mixed_model(choices, query, model, magic_string, model_name,
                return_weights=False):
    names = [s['name'] for s in choices]
    fuzzy_results = process.extract(query, names, scorer=fuzz.ratio)
    fuzzy_sum = max(sum(r[1] for r in fuzzy_results), 0.001)
    fuzzy_matches_and_confidences = [(r[0], r[1] / fuzzy_sum)
                                     for r in fuzzy_results]

    # net
    query = clean(query)
    query = tokenize(query, magic_string, 'embedding' in model_name)
    query = np.expand_dims(query, 0)
    if 'conv' in model_name and 'embedding' not in model_name:
        query = np.expand_dims(query, 2)
    prediction = model.predict(query)
    prediction = prediction[0]
    indexed = list(enumerate(prediction))
    weighted = sorted(indexed, key=lambda e: e[1], reverse=True)
    net_weighted = [(choices[r[0]]['name'], r[1]) for r in weighted]

    sorted_weighted = sorted(fuzzy_matches_and_confidences + net_weighted,
                             key=lambda e: e[1], reverse=True)

    # build results list, unique
    results = []
    weights = []
    for r in sorted_weighted:
        if r[0] not in results:
            results.append(r[0])
            weights.append(r[1])
    if not return_weights:
        return results
    return list(zip(results, weights))
def load_train_messages():
    filename = '.data/messages.txt'
    lines = []
    vocab = {}
    dics = []
    org_lines = None
    with open(filename) as f:
        itr = f.read().split('\n')
        itr = filter(lambda l: l, itr)
        org_lines = list(itr)
    for l in org_lines:
        text, d = preprocess.preprocess(l)
        dics.append(d)
        ws = preprocess.tokenize(text)
        lines.append(' '.join(ws))
        for w in ws:
            if w not in vocab:
                vocab[w] = len(vocab)
    return (lines, vocab, dics)
def build_model(lines_num=-1):
    # dataset = open('rawdata.csv', 'r')
    rawList = tokenize(lines_num)
    print(rawList[0])
    stopped_result = stop_words(rawList, lines_num)
    # stem_result = stem(stopped_result, lines_num)
    [corpus, dictionary] = doc_term_matrix(stopped_result)
    # it seems like the words make more sense without stemming; still working on it

    ################
    ####  LSI  #####
    ################
    # ldaList = lda(corpus, dictionary, lines_num)
    lsiList = lsi(corpus, dictionary, lines_num)
    save_file(lsiList)
        h2 = m.output_step(x, gh, volatile='on')
        wid = np.argmax(F.softmax(m.W(h2)).data[0])
        result_words.append(id2wd.get(wid, UNKNOWN_WORD))
        loop += 1
    return ' '.join(result_words)


msg_lines, msg_vocab, msg_dics = helpers.load_train_messages()
msg_vocab[EOS] = len(msg_vocab)
cmd_lines, cmd_vocab, cmd_id2wd = helpers.load_train_commands()
id = len(cmd_vocab)
cmd_vocab[EOS] = id
cmd_id2wd[id] = EOS

print('> ', end='')
test_msg = input()  # e.g. 'ミーティングは来週の月曜日14時にやる' ("the meeting is next Monday at 14:00")
text, dic = preprocess.preprocess(test_msg)
print('dic = {0}'.format(dic))
ws = preprocess.tokenize(text)

demb = 100
for epoch in range(20):
    m = model.Attention(msg_vocab, cmd_vocab, demb)
    filename = ".dest/m2c-{}.model".format(epoch)
    serializers.load_npz(filename, m)
    print(epoch, ': ', translate(m, cmd_id2wd, ws))
import math
import preprocess
from datetime import datetime

startTime = datetime.now()

# Parameter for tuning
k = 2

# Tokenize the queries and documents:
queryList = preprocess.tokenize('qrys.txt')
docList = preprocess.tokenize('quotes.txt')

# Compute average document length (used to compute the tf portion of the score further down)
total = 0
for doc in docList:
    total += len(doc)
meanDocLen = float(total) / len(docList)

output = open('tfidf.top', 'w')

for query in queryList:
    queryNumber = query.pop(0)
    queryUnique = list(set(query))  # remove duplicate words in the query

    # Get term frequencies for the query
    queryFreqs = {}
    for word in queryUnique:
        queryFreqs[word] = query.count(word)

    # Compute document frequency for the words in the query
            header_index = int(line.split('\t')[0][2:])
        elif line.startswith("#P"):
            idx, text = line.split('\t')
            thisparaid, sentid = idx.split()
            thisparaid = int(thisparaid.split('\t')[0][2:]) + 1
            sentid = int(sentid.split('\t')[0][2:]) + 1
            if lang == 'cmn':
                # Chinese text is collected and POS-tagged in one batch below
                lines.append(text)
            else:
                tagged_text = pos_tag(tokenize(text, lang), lang)
                tokens, tags = zip(*tagged_text)
                tl, newwordid = text2naf(" ".join(tokens), sentid, thisparaid, wordid)
                textlayer.append(tl)
                termlayer.append(term2naf(tokens, tags, wordid))
                wordid = newwordid
    if lang == 'cmn':
        tagged_texts = pos_tag(tokenize(lines, lang, batch=True), lang, batch=True)
        for tagged_text in tagged_texts:
            tokens, tags = zip(*tagged_text)
            tl, newwordid = text2naf(" ".join(tokens), sentid, thisparaid, wordid)