def get_test_data():
    """Extract features for every text and store them in ``features['ent-rus']``.

    Loads the dataset for grades 1, 3, 6 and 9 from a hard-coded directory,
    empties the ``ent-rus`` collection of the local MongoDB ``features``
    database, then inserts one ``{"grade", "features"}`` document per text.
    The extracted features of each text are printed before being stored.
    """
    corpus_dir = "/Users/Ivan/PycharmProject/ReadAbility/ApiData/rus/ent/"
    grade_labels = ['1', '3', '6', '9']
    corpus = prepare_dataset(corpus_dir, grade_labels)

    mongo = MongoClient('mongodb://localhost:27017/')
    target = mongo.features['ent-rus']
    # Remove whatever an earlier run left behind; only this run's rows remain.
    target.drop()

    for sample in corpus:
        # extract_features receives the text as UTF-8 encoded bytes.
        encoded = sample.data.encode('utf-8')
        vector = extract_features(encoded)
        record = {"grade": sample.grade, "features": vector}
        print(str(vector))
        target.insert_one(record)
def get_test_data():
    """Fill ``features['lem-rus']`` with one feature document per text.

    The dataset for grades 1, 3, 6 and 9 is read from a hard-coded
    directory. The ``lem-rus`` collection in the local MongoDB ``features``
    database is dropped first, then each text contributes a document holding
    its grade and the result of ``extract_features`` on its raw data.
    """
    source_dir = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/rus/word/"
    grade_labels = ['1', '3', '6', '9']
    corpus = prepare_dataset(source_dir, grade_labels)

    store = MongoClient('mongodb://localhost:27017/').features['lem-rus']
    # Start from an empty collection on every run.
    store.drop()

    for doc in corpus:
        store.insert_one({
            "grade": doc.grade,
            "features": extract_features(doc.data),
        })
def get_test_data():
    """Recreate ``features['shallow-eng']`` from the by-grade English data set.

    Reads the texts for grades ``2-3``, ``6-8`` and ``11-CCR`` from a
    hard-coded directory, drops the ``shallow-eng`` collection in the local
    MongoDB ``features`` database and inserts one document per text with its
    grade and extracted features. The grade of each text is printed as it is
    processed.
    """
    # Alternative grade set (disabled): ['K-1', '4-5', '9-10']
    grade_labels = ['2-3', '6-8', '11-CCR']
    data_dir = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/eng/byGrade/"
    corpus = prepare_dataset(data_dir, grade_labels)

    mongo = MongoClient('mongodb://localhost:27017/')
    shallow = mongo.features['shallow-eng']
    shallow.drop()

    for entry in corpus:
        # Progress output: printed before the feature extraction starts.
        print(entry.grade)
        shallow.insert_one({
            "grade": entry.grade,
            "features": extract_features(entry.data),
        })
def get_test_data():
    """Recreate ``features['shallow-rus']`` with one document per text.

    Texts for grades 1, 3, 6 and 9 come from a hard-coded directory. The
    ``shallow-rus`` collection of the local MongoDB ``features`` database is
    dropped, then for each text the features are extracted, the text's grade
    is printed and a ``{"grade", "features"}`` document is inserted.
    """
    grade_labels = ['1', '3', '6', '9']
    data_dir = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/rus/word/"
    samples = prepare_dataset(data_dir, grade_labels)

    connection = MongoClient('mongodb://localhost:27017/')
    sink = connection.features['shallow-rus']
    # Discard the results of any previous run.
    sink.drop()

    for sample in samples:
        vector = extract_features(sample.data)
        row = dict(grade=sample.grade, features=vector)
        # Progress output: printed once the features have been computed.
        print(sample.grade)
        sink.insert_one(row)
def create_lm(path_to_data, grades, f_type):
    """Build per-grade n-gram language models and store them in MongoDB.

    For every grade the database named ``f_type + '_' + grade`` is dropped
    and refilled. For each n-gram order (1 and 2) it receives:

    * one collection per text of that grade, counted over all *other* texts
      of the same grade (the text itself is left out), and
    * an ``all`` collection counted over every text of the grade.

    Each stored document has the keys ``type`` (the n-gram), ``n-gram`` (the
    order), ``count`` (raw frequency) and ``prob`` (the probability given by
    ``SimpleGoodTuringProbDist``). Progress and timings are printed.

    NOTE: a text whose name is literally ``all`` would share its key with
    the aggregate model.

    Args:
        path_to_data: Forwarded to ``prepare_dataset``.
        grades: Grade labels; each is forwarded to ``prepare_dataset`` and
            concatenated into a database name, so they must be strings.
        f_type: Prefix of the per-grade database names.
    """
    dataset = prepare_dataset(path_to_data, grades)
    client = MongoClient('mongodb://localhost:27017/')
    for grade in grades:
        print(grade + " grade")
        start_total = time.time()
        db_name = f_type + '_' + grade
        client.drop_database(db_name)
        db = client[db_name]

        # Select and tokenize this grade's texts once; the result does not
        # depend on the n-gram order, so it is shared by both passes below.
        grade_tokens = [
            (text.name,
             [token.lower() for token in nltk.word_tokenize(text.data)])
            for text in dataset
            if text.grade == grade
        ]

        for n in range(1, 3):
            print(str(n) + " gram")
            start = time.time()

            fd_dict = {name: FreqDist() for name, _ in grade_tokens}
            if grade_tokens:
                fd_dict['all'] = FreqDist()

            for name, tokens_l in grade_tokens:
                # ngrams() may return a one-shot iterator, so materialize it
                # once and reuse the list for every target distribution.
                n_grams = list(ngrams(tokens_l, n))
                for key in fd_dict:
                    # Leave-one-out: a text never feeds its own model.
                    if key != name:
                        fd_dict[key].update(n_grams)

            for key in fd_dict:
                fd = fd_dict[key]
                if not fd:
                    # Nothing was counted (e.g. the grade has a single
                    # text), so there is nothing to smooth or to store.
                    continue
                sgt = SimpleGoodTuringProbDist(fd)
                prob_many = [
                    {
                        "type": fd_key,
                        "n-gram": n,
                        "count": fd[fd_key],
                        "prob": sgt.prob(fd_key),
                    }
                    for fd_key in fd
                ]
                db[key].insert_many(prob_many)

            print(str(time.time() - start) + " sec")
        print(str(time.time() - start_total) + " sec total")
def get_test_data():
    """Recreate ``features['affixes']`` with one feature document per text.

    Loads the texts for grades 1, 3, 6 and 9 from a hard-coded directory,
    drops the ``affixes`` collection of the local MongoDB ``features``
    database and stores, for every text, its grade together with the result
    of ``extract_features`` on its data.
    """
    # Alternative grade set (disabled):
    #   ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10-11']
    grade_labels = ['1', '3', '6', '9']
    # Alternative data location (disabled):
    #   "/Users/Ivan/PycharmProject/ReadAbility/DataSets_test/rus/word/"
    data_dir = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/rus/word/"
    corpus = prepare_dataset(data_dir, grade_labels)

    affixes = MongoClient('mongodb://localhost:27017/').features['affixes']
    # Empty the collection so it only reflects this run.
    affixes.drop()

    for item in corpus:
        affixes.insert_one({
            "grade": item.grade,
            "features": extract_features(item.data),
        })
def get_test_data():
    """Rebuild the ``shallow-eng`` feature collection in MongoDB.

    The texts of grades ``2-3``, ``6-8`` and ``11-CCR`` are loaded from a
    hard-coded directory. After dropping ``features['shallow-eng']`` on the
    local MongoDB server, each text's grade is printed and a document with
    that grade and the text's extracted features is inserted.
    """
    data_dir = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/eng/byGrade/"
    # Alternative grade set (disabled): ['K-1', '4-5', '9-10']
    grade_labels = ['2-3', '6-8', '11-CCR']
    texts = prepare_dataset(data_dir, grade_labels)

    db = MongoClient('mongodb://localhost:27017/').features
    out = db['shallow-eng']
    out.drop()

    for t in texts:
        print(t.grade)
        grade = t.grade
        vector = extract_features(t.data)
        out.insert_one({"grade": grade, "features": vector})
def get_test_data(f_type, path_to_data, grades):
    """Extract ``f_type`` features for a data set and store them in MongoDB.

    The collection ``features[f_type]`` on the local MongoDB server is
    dropped and refilled with one ``{"grade", "features"}`` document per
    text. Each text's name is printed as it is processed, and the elapsed
    time of the whole loop is printed at the end.

    Args:
        f_type: Name of the target collection; also forwarded to
            ``extract_features``.
        path_to_data: Forwarded to ``prepare_dataset``.
        grades: Forwarded to ``prepare_dataset`` and ``extract_features``.
    """
    texts = prepare_dataset(path_to_data, grades)
    mongo = MongoClient('mongodb://localhost:27017/')
    target = mongo.features[f_type]
    target.drop()

    # Only extraction and insertion are timed, not the data set loading.
    started_at = time.time()
    for item in texts:
        print(item.name)
        target.insert_one({
            "grade": item.grade,
            "features": extract_features(f_type, item, grades),
        })
    elapsed = time.time() - started_at
    print(str(elapsed) + " sec total")
def get_test_data():
    """Recreate ``features['inclusions']`` with one document per text.

    Calls ``make_features()`` once and passes its result to
    ``extract_features`` for every text. Texts for grades 1, 3, 6 and 9 are
    loaded from a hard-coded directory; the ``inclusions`` collection of the
    local MongoDB ``features`` database is dropped and refilled. The
    extracted features of each text are printed before being stored.
    """
    # Built once up front and shared by every extract_features call.
    shared_model = make_features()

    grade_labels = ['1', '3', '6', '9']
    # Alternative data location (disabled):
    #   "/Users/Ivan/PycharmProject/ReadAbility/DataSets_test/rus/word/"
    data_dir = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/rus/word/"
    corpus = prepare_dataset(data_dir, grade_labels)

    mongo = MongoClient('mongodb://localhost:27017/')
    inclusions = mongo.features['inclusions']
    inclusions.drop()

    for sample in corpus:
        vector = extract_features(sample.data, shared_model)
        row = {"grade": sample.grade, "features": vector}
        print(str(vector))
        inclusions.insert_one(row)
def get_all_ig(path, grades, top_n=1000):
    """Return the highest-scoring candidate words of a data set.

    Every candidate produced by ``prepare_ig_candidates`` is scored with
    ``calc_ig_value``; the words are then ranked by that score in descending
    order and the best ``top_n`` are returned.

    Args:
        path: Forwarded to ``prepare_dataset``.
        grades: Forwarded to ``prepare_dataset`` and ``calc_ig_value``.
        top_n: Maximum number of words to return (non-negative). Defaults
            to 1000, the cutoff that used to be hard-coded.

    Returns:
        A list of at most ``top_n`` words, best score first.
    """
    dataset = prepare_dataset(path, grades)
    words = prepare_ig_candidates(dataset)

    # Keyed by word, so a candidate that appears more than once keeps a
    # single entry.
    ig_words = {word: calc_ig_value(word, grades, dataset) for word in words}

    ranked = sorted(ig_words.items(), key=operator.itemgetter(1), reverse=True)
    return [word for word, _ in ranked[:top_n]]
def get_test_data():
    """Rebuild the ``inclusions`` feature collection in MongoDB.

    The result of a single ``make_features()`` call is handed to
    ``extract_features`` together with each text's data. Texts of grades 1,
    3, 6 and 9 are read from a hard-coded directory, the
    ``features["inclusions"]`` collection on the local MongoDB server is
    dropped, and one ``{"grade", "features"}`` document is inserted per
    text. Each text's features are printed before insertion.
    """
    helper = make_features()

    # Alternative data location (disabled):
    #   "/Users/Ivan/PycharmProject/ReadAbility/DataSets_test/rus/word/"
    data_dir = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/rus/word/"
    grade_labels = ["1", "3", "6", "9"]
    texts = prepare_dataset(data_dir, grade_labels)

    sink = MongoClient("mongodb://localhost:27017/").features["inclusions"]
    # Wipe earlier results so the collection mirrors this run only.
    sink.drop()

    for t in texts:
        extracted = extract_features(t.data, helper)
        document = dict(grade=t.grade, features=extracted)
        print(str(extracted))
        sink.insert_one(document)
def get_test_data(f_type, path_to_data, grades):
    """Refill ``features[f_type]`` with the extracted features of a data set.

    Drops the ``f_type`` collection of the local MongoDB ``features``
    database, then inserts one document per text holding the text's grade
    and its ``extract_features`` result. The name of each text is printed
    while it is processed; the time spent in the loop is printed afterwards.

    Args:
        f_type: Target collection name, also passed to ``extract_features``.
        path_to_data: Passed to ``prepare_dataset``.
        grades: Passed to ``prepare_dataset`` and ``extract_features``.
    """
    corpus = prepare_dataset(path_to_data, grades)
    collection = MongoClient('mongodb://localhost:27017/').features[f_type]
    collection.drop()

    t0 = time.time()
    for sample in corpus:
        print(sample.name)
        grade = sample.grade
        vector = extract_features(f_type, sample, grades)
        collection.insert_one({"grade": grade, "features": vector})
    print(str(time.time() - t0) + " sec total")
def get_test_data():
    """Recreate ``features['pos-rus']`` with concatenated per-POS features.

    For each text of grades 1, 3, 6 and 9 (loaded from a hard-coded
    directory) ``extract_features`` is called once per entry of the POS set
    (``nouns``, ``verbs``, ``adjectives``, ``adverbs``, ``prepositions``) on
    the UTF-8 encoded text, and the results are concatenated in that order.
    The ``pos-rus`` collection of the local MongoDB ``features`` database is
    dropped and refilled with one document per text; each combined feature
    list is printed before being stored.
    """
    # The order here fixes the order of the concatenated feature values.
    pos_groups = [nouns, verbs, adjectives, adverbs, prepositions]
    grade_labels = ['1', '3', '6', '9']
    data_dir = "/Users/Ivan/PycharmProject/ReadAbility/ApiData/rus/pos/"
    corpus = prepare_dataset(data_dir, grade_labels)

    mongo = MongoClient('mongodb://localhost:27017/')
    pos_store = mongo.features['pos-rus']
    pos_store.drop()

    for sample in corpus:
        combined = []
        for group in pos_groups:
            combined.extend(extract_features(sample.data.encode('utf-8'), group))
        record = {"grade": sample.grade, "features": combined}
        print(str(combined))
        pos_store.insert_one(record)
def get_test_data():
    """Recreate ``features['pos-eng']`` with concatenated per-POS features.

    Texts of grades ``2-3``, ``6-8`` and ``11-CCR`` are loaded from a
    hard-coded directory. For each text ``extract_features`` is called once
    per entry of the POS set (``nouns``, ``verbs``, ``adverbs``,
    ``adjectives``, ``prepositions``) and the results are concatenated in
    that order. The combined list is printed, then stored with the text's
    grade in the ``pos-eng`` collection of the local MongoDB ``features``
    database, which is dropped beforehand.
    """
    # The order here fixes the order of the concatenated feature values.
    pos_groups = [nouns, verbs, adverbs, adjectives, prepositions]

    # Alternative grade sets (disabled):
    #   ['K-1', '4-5', '9-10']
    #   ['K-1', '2-3', '4-5', '6-8', '9-10', '11-CCR']
    grade_labels = ['2-3', '6-8', '11-CCR']
    data_dir = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/eng/byGrade/"
    texts = prepare_dataset(data_dir, grade_labels)

    pos_store = MongoClient('mongodb://localhost:27017/').features['pos-eng']
    pos_store.drop()

    for t in texts:
        merged = []
        for group in pos_groups:
            merged.extend(extract_features(t.data, group))
        # Printed once per text, after all POS groups have been processed.
        print(merged)
        pos_store.insert_one({"grade": t.grade, "features": merged})