Python prepare_dataset 예제들, common_functions.prepare_dataset Python 예제들

예제 #1

0

파일 보기

파일: ent_features_rus.py 프로젝트: ivankoval/ReadAbility

def get_test_data():
    """Rebuild the ``ent-rus`` collection of the ``features`` MongoDB database.

    Loads the texts for the hard-coded grades via ``prepare_dataset``, drops
    the existing collection, then stores one ``{"grade", "features"}``
    document per text. The extracted features are also printed.
    """
    grades = ['1', '3', '6', '9']
    path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/ApiData/rus/ent/"
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')
    try:
        features_collection = client.features['ent-rus']
        # Remove whatever an earlier run stored before inserting again.
        features_collection.drop()

        for text in dataset:
            # extract_features is handed UTF-8 encoded bytes here.
            features = extract_features(text.data.encode('utf-8'))
            text_features = {"grade": text.grade, "features": features}
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(str(features))
            features_collection.insert_one(text_features)
    finally:
        # Release the connection even if extraction or insertion fails.
        client.close()

예제 #2

0

파일 보기

파일: lematization_rus.py 프로젝트: Type-of-Read/ReadAbility-23

def get_test_data():
    """Rebuild the ``lem-rus`` collection of the ``features`` MongoDB database.

    Loads the texts for the hard-coded grades via ``prepare_dataset``, drops
    the existing collection, then stores one ``{"grade", "features"}``
    document per text.
    """
    grades = ['1', '3', '6', '9']

    path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/rus/word/"
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')
    try:
        features_collection = client.features['lem-rus']
        # Remove whatever an earlier run stored before inserting again.
        features_collection.drop()

        for text in dataset:
            text_features = {"grade": text.grade,
                             "features": extract_features(text.data)}
            features_collection.insert_one(text_features)
    finally:
        # Release the connection even if extraction or insertion fails.
        client.close()

예제 #3

0

파일 보기

파일: shallow_features_eng.py 프로젝트: Type-of-Read/ReadAbility-23

def get_test_data():
    """Rebuild the ``shallow-eng`` collection of the ``features`` database.

    Loads the texts for the hard-coded grades via ``prepare_dataset``, drops
    the existing collection, then stores one ``{"grade", "features"}``
    document per text. The grade of each processed text is printed.
    """
    # Alternative grade set:
    # grades = ['K-1', '4-5', '9-10']
    grades = ['2-3', '6-8', '11-CCR']
    path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/eng/byGrade/"
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')
    try:
        features_collection = client.features['shallow-eng']
        # Remove whatever an earlier run stored before inserting again.
        features_collection.drop()

        for text in dataset:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(text.grade)
            text_features = {"grade": text.grade,
                             "features": extract_features(text.data)}
            features_collection.insert_one(text_features)
    finally:
        # Release the connection even if extraction or insertion fails.
        client.close()

예제 #4

0

파일 보기

def get_test_data():
    """Extract features for every text and store them in ``features['ent-rus']``.

    The dataset comes from ``prepare_dataset`` with the hard-coded path and
    grades. The target collection is dropped first; afterwards it contains
    one ``{"grade", "features"}`` document per text. Each feature value is
    printed as it is produced.
    """
    grades = ['1', '3', '6', '9']
    path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/ApiData/rus/ent/"
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')
    try:
        features_collection = client.features['ent-rus']
        features_collection.drop()  # discard documents from previous runs

        for text in dataset:
            # The extractor receives the text as UTF-8 encoded bytes.
            features = extract_features(text.data.encode('utf-8'))
            text_features = {"grade": text.grade, "features": features}
            # Parenthesised form keeps the line valid on Python 2 and 3.
            print(str(features))
            features_collection.insert_one(text_features)
    finally:
        # Always close the client, including on errors inside the loop.
        client.close()

예제 #5

0

파일 보기

파일: shallow_features_rus.py 프로젝트: Type-of-Read/ReadAbility-23

def get_test_data():
    """Rebuild the ``shallow-rus`` collection of the ``features`` database.

    Loads the texts for the hard-coded grades via ``prepare_dataset``, drops
    the existing collection, then stores one ``{"grade", "features"}``
    document per text. The grade of each processed text is printed.
    """
    grades = ['1', '3', '6', '9']
    path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/rus/word/"

    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')
    try:
        features_collection = client.features['shallow-rus']
        # Remove whatever an earlier run stored before inserting again.
        features_collection.drop()

        for text in dataset:
            features = extract_features(text.data)
            text_features = {"grade": text.grade, "features": features}
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(text.grade)
            features_collection.insert_one(text_features)
    finally:
        # Release the connection even if extraction or insertion fails.
        client.close()

예제 #6

0

파일 보기

파일: build_model.py 프로젝트: Type-of-Read/ReadAbility-23

def create_lm(path_to_data, grades, f_type):
    """Build per-grade 1-gram and 2-gram frequency models and store them in MongoDB.

    For every grade the database named ``f_type + '_' + grade`` is dropped and
    refilled. Inside it, the collection named after a text holds the n-gram
    statistics of every *other* text of that grade, and the ``all``
    collection is updated from every text whose name is not ``'all'``.
    Each stored document has the keys ``type`` (the n-gram), ``n-gram`` (the
    order n), ``count`` and ``prob`` (``SimpleGoodTuringProbDist.prob``).

    Progress and elapsed times are printed.

    Args:
        path_to_data: forwarded unchanged to ``prepare_dataset``.
        grades: grade labels; each is concatenated into a database name, so
            they must be strings.
        f_type: prefix of the per-grade database names.
    """
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')
    try:
        for grade in grades:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(grade + " grade")
            start_total = time.time()
            db_name = f_type + '_' + grade
            client.drop_database(db_name)
            db = client[db_name]

            # Select and tokenise the texts of this grade once; the lowercase
            # tokens are reused for every n instead of being recomputed.
            # (Tokenisation time is therefore part of the per-grade total,
            # not of the per-n timings.)
            tokenized = [
                (text.name,
                 [token.lower() for token in nltk.word_tokenize(text.data)])
                for text in dataset
                if text.grade == grade
            ]

            for n in range(1, 3):
                print(str(n) + " gram")
                start = time.time()

                fd_dict = dict()
                for name, _ in tokenized:
                    fd_dict[name] = FreqDist()
                fd_dict['all'] = FreqDist()

                for name, tokens_l in tokenized:
                    # Materialise the n-grams once so the same sequence can
                    # feed every distribution that excludes this text.
                    n_grams = list(ngrams(tokens_l, n))
                    for key in fd_dict:
                        if key != name:
                            fd_dict[key].update(n_grams)

                for key in fd_dict:
                    fd = fd_dict[key]
                    if not fd:
                        # An empty distribution yields no documents, so there
                        # is nothing to fit or insert for this collection.
                        continue
                    sgt = SimpleGoodTuringProbDist(fd)
                    prob_many = [
                        {"type": fd_key, "n-gram": n, "count": fd[fd_key], "prob": sgt.prob(fd_key)}
                        for fd_key in fd
                    ]
                    db[key].insert_many(prob_many)

                print(str(time.time() - start) + " sec")

            print(str(time.time() - start_total) + " sec total")
    finally:
        # Release the connection even if model building fails midway.
        client.close()

예제 #7

0

파일 보기

파일: affixes.py 프로젝트: Type-of-Read/ReadAbility-23

def get_test_data():
    """Rebuild the ``affixes`` collection of the ``features`` MongoDB database.

    Loads the texts for the hard-coded grades via ``prepare_dataset``, drops
    the existing collection, then stores one ``{"grade", "features"}``
    document per text.
    """
    grades = ['1', '3', '6', '9']
    # Alternative configurations:
    # grades = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10-11']
    # path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_test/rus/word/"
    path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/rus/word/"
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')
    try:
        features_collection = client.features['affixes']
        # Remove whatever an earlier run stored before inserting again.
        features_collection.drop()

        for text in dataset:
            text_features = {"grade": text.grade,
                             "features": extract_features(text.data)}
            features_collection.insert_one(text_features)
    finally:
        # Release the connection even if extraction or insertion fails.
        client.close()

예제 #8

0

파일 보기

def get_test_data():
    """Extract features for every text and store them in ``features['shallow-eng']``.

    The dataset comes from ``prepare_dataset`` with the hard-coded path and
    grades. The target collection is dropped first; afterwards it contains
    one ``{"grade", "features"}`` document per text. The grade of each text
    is printed before its features are extracted.
    """
    # Alternative grade set:
    # grades = ['K-1', '4-5', '9-10']
    grades = ['2-3', '6-8', '11-CCR']
    path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/eng/byGrade/"
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')
    try:
        features_collection = client.features['shallow-eng']
        features_collection.drop()  # discard documents from previous runs

        for text in dataset:
            # Parenthesised form keeps the line valid on Python 2 and 3.
            print(text.grade)
            text_features = {
                "grade": text.grade,
                "features": extract_features(text.data)
            }
            features_collection.insert_one(text_features)
    finally:
        # Always close the client, including on errors inside the loop.
        client.close()

예제 #9

0

파일 보기

파일: lm_features.py 프로젝트: ivankoval/ReadAbility

def get_test_data(f_type, path_to_data, grades):
    """Rebuild the ``features[f_type]`` MongoDB collection from a dataset.

    The collection is dropped, then one ``{"grade", "features"}`` document is
    inserted per text. The name of each text and the total elapsed time are
    printed.

    Args:
        f_type: name of the target collection; also passed to
            ``extract_features``.
        path_to_data: forwarded unchanged to ``prepare_dataset``.
        grades: forwarded to ``prepare_dataset`` and ``extract_features``.
    """
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')
    try:
        features_collection = client.features[f_type]
        # Remove whatever an earlier run stored before inserting again.
        features_collection.drop()

        start = time.time()

        for text in dataset:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(text.name)
            text_features = {"grade": text.grade,
                             "features": extract_features(f_type, text, grades)}
            features_collection.insert_one(text_features)

        print(str(time.time() - start) + " sec total")
    finally:
        # Release the connection even if extraction or insertion fails.
        client.close()

예제 #10

0

파일 보기

파일: inclusions.py 프로젝트: Type-of-Read/ReadAbility-23

def get_test_data():
    """Rebuild the ``inclusions`` collection of the ``features`` database.

    Obtains a helper object from ``make_features``, loads the texts for the
    hard-coded grades via ``prepare_dataset``, drops the existing collection,
    then stores one ``{"grade", "features"}`` document per text. The
    extracted features are also printed.
    """
    # Passed to extract_features for every text.
    clf = make_features()

    grades = ['1', '3', '6', '9']
    # Alternative data location:
    # path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_test/rus/word/"
    path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/rus/word/"
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')
    try:
        features_collection = client.features['inclusions']
        # Remove whatever an earlier run stored before inserting again.
        features_collection.drop()

        for text in dataset:
            features = extract_features(text.data, clf)
            text_features = {"grade": text.grade, "features": features}
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(str(features))
            features_collection.insert_one(text_features)
    finally:
        # Release the connection even if extraction or insertion fails.
        client.close()

예제 #11

0

파일 보기

파일: ig_value.py 프로젝트: Type-of-Read/ReadAbility-23

def get_all_ig(path, grades):
    """Return up to 1000 candidate words ranked by their ``calc_ig_value`` score.

    Candidates come from ``prepare_ig_candidates`` applied to the dataset
    loaded by ``prepare_dataset(path, grades)``. Words are ordered by score,
    highest first; only the words (not the scores) are returned.
    """
    dataset = prepare_dataset(path, grades)

    # Score every candidate word against the whole dataset.
    ig_words = {}
    for candidate in prepare_ig_candidates(dataset):
        ig_words[candidate] = calc_ig_value(candidate, grades, dataset)

    ranked = sorted(ig_words.items(), key=operator.itemgetter(1), reverse=True)

    # Keep only the word from each (word, score) pair of the best 1000.
    return [pair[0] for pair in ranked[:1000]]

예제 #12

0

파일 보기

파일: inclusions.py 프로젝트: Type-of-Read/ReadAbility-23

def get_test_data():
    """Extract features for every text and store them in ``features["inclusions"]``.

    ``make_features`` is called once and its result is passed to every
    ``extract_features`` call. The dataset comes from ``prepare_dataset``
    with the hard-coded path and grades. The target collection is dropped
    first; afterwards it contains one ``{"grade", "features"}`` document per
    text. Each feature value is printed as it is produced.
    """
    clf = make_features()

    grades = ["1", "3", "6", "9"]
    # Alternative data location:
    # path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_test/rus/word/"
    path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/rus/word/"
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient("mongodb://localhost:27017/")
    try:
        features_collection = client.features["inclusions"]
        features_collection.drop()  # discard documents from previous runs

        for text in dataset:
            features = extract_features(text.data, clf)
            text_features = {"grade": text.grade, "features": features}
            # Parenthesised form keeps the line valid on Python 2 and 3.
            print(str(features))
            features_collection.insert_one(text_features)
    finally:
        # Always close the client, including on errors inside the loop.
        client.close()

예제 #13

0

파일 보기

파일: lm_features.py 프로젝트: Type-of-Read/ReadAbility-23

def get_test_data(f_type, path_to_data, grades):
    """Extract features for every text and store them in ``features[f_type]``.

    The target collection is dropped first; afterwards it contains one
    ``{"grade", "features"}`` document per text. The name of every text is
    printed as it is processed, followed by the total elapsed time.

    Args:
        f_type: name of the target collection; also passed to
            ``extract_features``.
        path_to_data: forwarded unchanged to ``prepare_dataset``.
        grades: forwarded to ``prepare_dataset`` and ``extract_features``.
    """
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')
    try:
        features_collection = client.features[f_type]
        features_collection.drop()  # discard documents from previous runs

        start = time.time()

        for text in dataset:
            # Parenthesised form keeps the line valid on Python 2 and 3.
            print(text.name)
            text_features = {
                "grade": text.grade,
                "features": extract_features(f_type, text, grades)
            }
            features_collection.insert_one(text_features)

        print(str(time.time() - start) + " sec total")
    finally:
        # Always close the client, including on errors inside the loop.
        client.close()

예제 #14

0

파일 보기

파일: ig_value.py 프로젝트: Type-of-Read/ReadAbility-23

def get_all_ig(path, grades):
    """Rank candidate words by ``calc_ig_value`` and return the best 1000.

    The dataset is loaded with ``prepare_dataset(path, grades)`` and the
    candidates are produced by ``prepare_ig_candidates``. The result is a
    list of words in descending score order, at most 1000 entries long.
    """
    dataset = prepare_dataset(path, grades)
    candidates = prepare_ig_candidates(dataset)

    # Map each candidate word to its score.
    ig_words = dict(
        (word, calc_ig_value(word, grades, dataset)) for word in candidates
    )

    # Sorting the keys by their score gives the same order as sorting the
    # (word, score) pairs by score and then discarding the score.
    ranking = sorted(ig_words, key=ig_words.get, reverse=True)
    return ranking[:1000]

예제 #15

0

파일 보기

파일: pos_features_rus.py 프로젝트: Type-of-Read/ReadAbility-23

def get_test_data():
    """Rebuild the ``pos-rus`` collection of the ``features`` MongoDB database.

    Loads the texts for the hard-coded grades via ``prepare_dataset``, drops
    the existing collection, then stores one ``{"grade", "features"}``
    document per text. A text's feature list is the concatenation of the
    ``extract_features`` results for each entry of ``pos_set``, in order.
    The feature list is also printed.
    """
    pos_set = [nouns, verbs, adjectives, adverbs, prepositions]

    grades = ['1', '3', '6', '9']

    path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/ApiData/rus/pos/"
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')
    try:
        features_collection = client.features['pos-rus']
        # Remove whatever an earlier run stored before inserting again.
        features_collection.drop()

        for text in dataset:
            # Encode once per text instead of once per POS set.
            encoded = text.data.encode('utf-8')
            features = []
            for pos in pos_set:
                features += extract_features(encoded, pos)

            text_features = {"grade": text.grade, "features": features}
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(str(features))
            features_collection.insert_one(text_features)
    finally:
        # Release the connection even if extraction or insertion fails.
        client.close()

예제 #16

0

파일 보기

파일: pos_features_rus.py 프로젝트: Type-of-Read/ReadAbility-23

def get_test_data():
    """Extract per-POS features for every text and store them in ``features['pos-rus']``.

    The dataset comes from ``prepare_dataset`` with the hard-coded path and
    grades. The target collection is dropped first; afterwards it contains
    one ``{"grade", "features"}`` document per text, where ``features`` joins
    the ``extract_features`` output for every entry of ``pos_set`` in order.
    Each feature list is printed as it is produced.
    """
    pos_set = [nouns, verbs, adjectives, adverbs, prepositions]

    grades = ['1', '3', '6', '9']

    path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/ApiData/rus/pos/"
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')
    try:
        features_collection = client.features['pos-rus']
        features_collection.drop()  # discard documents from previous runs

        for text in dataset:
            # The UTF-8 bytes do not depend on the POS set, so build them once.
            raw = text.data.encode('utf-8')
            features = []
            for pos in pos_set:
                features += extract_features(raw, pos)

            text_features = {"grade": text.grade, "features": features}
            # Parenthesised form keeps the line valid on Python 2 and 3.
            print(str(features))
            features_collection.insert_one(text_features)
    finally:
        # Always close the client, including on errors inside the loop.
        client.close()

예제 #17

0

파일 보기

파일: pos_features_eng.py 프로젝트: Type-of-Read/ReadAbility-23

def get_test_data():
    """Rebuild the ``pos-eng`` collection of the ``features`` MongoDB database.

    Loads the texts for the hard-coded grades via ``prepare_dataset``, drops
    the existing collection, then stores one ``{"grade", "features"}``
    document per text. A text's feature list is the concatenation of the
    ``extract_features`` results for each entry of ``pos_set``, in order.
    The feature list is also printed.
    """
    pos_set = [nouns, verbs, adverbs, adjectives, prepositions]

    # Alternative grade sets:
    # grades = ['K-1', '4-5', '9-10']
    grades = ['2-3', '6-8', '11-CCR']
    # grades = ['K-1', '2-3', '4-5', '6-8', '9-10', '11-CCR']

    path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/eng/byGrade/"
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')
    try:
        features_collection = client.features['pos-eng']
        # Remove whatever an earlier run stored before inserting again.
        features_collection.drop()

        for text in dataset:
            features = []
            for pos in pos_set:
                features += extract_features(text.data, pos)
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(features)
            text_features = {"grade": text.grade, "features": features}
            features_collection.insert_one(text_features)
    finally:
        # Release the connection even if extraction or insertion fails.
        client.close()

예제 #18

0

파일 보기

파일: pos_features_eng.py 프로젝트: Type-of-Read/ReadAbility-23

def get_test_data():
    """Extract per-POS features for every text and store them in ``features['pos-eng']``.

    The dataset comes from ``prepare_dataset`` with the hard-coded path and
    grades. The target collection is dropped first; afterwards it contains
    one ``{"grade", "features"}`` document per text, where ``features`` joins
    the ``extract_features`` output for every entry of ``pos_set`` in order.
    Each feature list is printed before it is stored.
    """
    pos_set = [nouns, verbs, adverbs, adjectives, prepositions]

    # Alternative grade sets:
    # grades = ['K-1', '4-5', '9-10']
    grades = ['2-3', '6-8', '11-CCR']
    # grades = ['K-1', '2-3', '4-5', '6-8', '9-10', '11-CCR']

    path_to_data = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/eng/byGrade/"
    dataset = prepare_dataset(path_to_data, grades)

    client = MongoClient('mongodb://localhost:27017/')
    try:
        features_collection = client.features['pos-eng']
        features_collection.drop()  # discard documents from previous runs

        for text in dataset:
            features = []
            for pos in pos_set:
                features += extract_features(text.data, pos)
            # Parenthesised form keeps the line valid on Python 2 and 3.
            print(features)
            text_features = {"grade": text.grade, "features": features}
            features_collection.insert_one(text_features)
    finally:
        # Always close the client, including on errors inside the loop.
        client.close()