import numpy as np
from xml.etree import ElementTree


def feature_three(text):
    # Number of multi-syllable words (more than one syllable) per sentence,
    # similar in spirit to the SMOG polysyllable count.
    words = get_words(text)
    poly_syllables_words = 0
    for word in words:
        if count_syllables(word) > 1:
            poly_syllables_words += 1
    return poly_syllables_words / total_sentences(text)
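# The helpers get_words, count_syllables and total_sentences are not defined
# anywhere in this listing. The sketch below is a plausible stand-in (an
# assumption, not the project's actual implementation) so the feature
# functions here can be run end to end.
import re

def get_words(text):
    # Hypothetical tokenizer: alphabetic word tokens, lowercased.
    return re.findall(r"[a-zA-Z']+", text.lower())

def total_sentences(text):
    # Hypothetical sentence counter: runs of ., ! or ? end a sentence.
    return max(1, len(re.findall(r"[.!?]+", text)))

def count_syllables(word):
    # Hypothetical syllable heuristic: count groups of consecutive vowels.
    return max(1, len(re.findall(r"[aeiouy]+", word)))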
def feature_two(text):
    # Percentage of multi-syllable words among all words.
    words = get_words(text)
    poly_syllables_words = 0
    for word in words:
        if count_syllables(word) > 1:
            poly_syllables_words += 1
    return poly_syllables_words / len(words) * 100
def feature_seven(text):
    # Flesch-Kincaid Grade Level:
    # 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
    words = get_words(text)
    syllables_sum = 0
    for word in words:
        syllables_sum += count_syllables(word)
    return 0.39 * len(words) / total_sentences(text) \
        + 11.8 * syllables_sum / len(words) - 15.59
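# feature_seven is the standard Flesch-Kincaid Grade Level formula: the
# result approximates the U.S. school grade needed to read the text, so
# short sentences of short words score low. For example, 8 words/sentence
# and 1.3 syllables/word give 0.39*8 + 11.8*1.3 - 15.59 = 2.87, i.e.
# roughly third-grade text.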
def extract_features(data, pos):
    # Frequency features for one part-of-speech class, normalized by the
    # text's word, unique-word and sentence counts.
    extr_words = extract_words(data, pos)

    words = extr_words.words
    unique_words = extr_words.unique_words

    total_w = len(get_words(data))
    total_unique_w = len(np.unique(get_words(data)))
    total_s = total_sentences(data)

    feature1 = words / total_w * 100
    feature2 = unique_words / total_w * 100
    feature3 = unique_words / total_unique_w
    feature4 = words / total_s
    feature5 = unique_words / total_s

    return [feature1, feature2, feature3, feature4, feature5]
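# extract_words is also missing from this listing; judging by the attribute
# access above, it returns an object with .words and .unique_words counts
# for the requested part of speech. A hypothetical stand-in:
from collections import namedtuple

PosCounts = namedtuple('PosCounts', ['words', 'unique_words'])

def extract_words(data, pos):
    # Hypothetical: count tokens whose POS tag matches. The tagging step is
    # faked here; a real version would use an actual tagger (e.g. nltk).
    tagged = [(w, 'NOUN') for w in get_words(data)]  # placeholder tagger
    hits = [w for w, tag in tagged if tag == pos]
    return PosCounts(words=len(hits), unique_words=len(set(hits)))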
def extract_features(data, pos_type):
    # Same POS features as the previous function, but the input is an XML
    # document whose first child element carries the plain text.
    root = ElementTree.fromstring(data)
    pure_text = root[0].text

    extr_words = extract_words(data, pos_type)

    words = extr_words.words
    unique_words = extr_words.unique_words

    total_w = len(get_words(pure_text))
    total_unique_w = len(np.unique(get_words(pure_text)))
    total_s = total_sentences(pure_text)

    feature1 = words / total_w * 100
    feature2 = unique_words / total_w * 100
    feature3 = unique_words / total_unique_w
    feature4 = words / total_s
    feature5 = unique_words / total_s

    return [feature1, feature2, feature3, feature4, feature5]
def extract_features(data):
    # Named-entity density: entities per 100 words and per 100 sentences,
    # with the entity count taken from an external API on the raw XML.
    root = ElementTree.fromstring(data)
    pure_text = root[0].text

    ne = extract_entities_api(data)
    tw = len(get_words(pure_text))
    ts = total_sentences(pure_text)

    feature1 = ne / tw * 100
    feature2 = ne / ts * 100

    return [feature1, feature2]
def feature_five(text):
    # Dale-Chall readability score. The file holds the Dale-Chall *easy*
    # word list; every word not on it counts as difficult. Note that the
    # standard formula adds the 3.6365 constant only when difficult words
    # exceed 5%; here it is applied unconditionally, as in the original.
    path_easy_words = "/Users/Ivan/PycharmProject/ReadAbility/DataSets_raw/DaleChallEasyWordList.txt"
    words = get_words(text)
    difficult_words_sum = 0

    with open(path_easy_words, 'r') as f:
        # strip trailing newlines so the membership test can actually match
        easy_words = {line.strip() for line in f}

    for word in words:
        if word not in easy_words:
            difficult_words_sum += 1
    return 0.0496 * len(words) / total_sentences(text) \
        + 0.1579 * difficult_words_sum / len(words) * 100 + 3.6365
def extract_features(data):
    # Named-entity density on plain text (no XML wrapper).
    extr_entities = extract_entities(data)

    ne = extr_entities.ne
    tw = len(get_words(data))
    ts = total_sentences(data)

    feature1 = ne / tw * 100
    feature2 = ne / ts * 100

    print(str(feature1) + " " + str(feature2))
    return [feature1, feature2]
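# extract_entities_api and extract_entities are not in this listing either.
# From their call sites, the first returns a plain entity count and the
# second an object exposing a .ne count. Hypothetical stand-ins (a real
# version would call an actual NER system):
EntityCounts = namedtuple('EntityCounts', ['ne'])

def extract_entities_api(data):
    # Hypothetical: count capitalized tokens as a crude named-entity proxy.
    return sum(1 for w in data.split() if w[:1].isupper())

def extract_entities(data):
    return EntityCounts(ne=extract_entities_api(data))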
def extract_features(data):
    # Share of words carrying a known affix. pref.txt / suff.txt hold one
    # affix per line with Windows line endings, hence the '\r\n' cleanup.
    prefixes = [p.replace('\r\n', '') for p in load_data('pref.txt')]
    suffixes = [s.replace('\r\n', '') for s in load_data('suff.txt')]
    prefix_num = 0
    suffix_num = 0
    words = get_words(data)

    for word in words:
        word = word.lower()

        # word.index(prefix) == 0 in the original is just a prefix test
        if any(word.startswith(prefix) for prefix in prefixes):
            prefix_num += 1

        # the original tested word.index(suffix) > 2, i.e. the affix occurs
        # somewhere after the third character, not strictly at the end
        if any(word.find(suffix) > 2 for suffix in suffixes):
            suffix_num += 1

    # print([prefix_num / len(words), suffix_num / len(words)])
    # return [prefix_num / len(words), suffix_num / len(words)]
    print(suffix_num / len(words))
    return [suffix_num / len(words)]
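# load_data is another helper missing from this listing; from its use above
# it returns the raw lines of a resource file. A hypothetical stand-in:
def load_data(filename):
    # Hypothetical: read the affix list shipped alongside the code.
    with open(filename, 'r') as f:
        return f.readlines()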
def feature_eight(text):
    # Total word count.
    return len(get_words(text))
def extract_features(data, clf):
    # For each word, build a one-hot vector over the known prefixes and
    # suffixes and ask a trained classifier whether the word looks
    # 'borrowed' or 'original'; return the percentage of borrowed words.
    prefixes = [p.replace('\r\n', '') for p in load_data('pref.txt')]
    suffixes = [s.replace('\r\n', '') for s in load_data('suff.txt')]

    words = get_words(data)
    borrowed_num = 0
    original_num = 0  # tracked but not returned, as in the original

    for word in words:
        word = word.lower()

        # longest prefix that starts the word, if any
        prefix_cand = [p for p in prefixes if word.startswith(p)]
        prefix = max(prefix_cand, key=len) if prefix_cand else 'none'

        # longest suffix-like affix occurring after the third character
        suffix_cand = [s for s in suffixes if word.find(s) > 2]
        suffix = max(suffix_cand, key=len) if suffix_cand else 'none'

        # one-hot encoding over the two affix vocabularies
        arr = [1 if p == prefix else 0 for p in prefixes]
        arr += [1 if s == suffix else 0 for s in suffixes]

        if suffix != 'none' or prefix != 'none':
            # predict once; scikit-learn expects a 2-D input
            label = clf.predict([arr])[0]
            if label == 'borrowed':
                borrowed_num += 1
            if label == 'original':
                original_num += 1
    return [borrowed_num / len(words) * 100]
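# The classifier is supplied by the caller; any scikit-learn estimator
# trained on the same one-hot affix encoding with string labels would fit
# the contract above. A hypothetical training sketch (toy data, not the
# project's real model):
from sklearn.tree import DecisionTreeClassifier

def train_toy_affix_clf(encoded_words, labels):
    # encoded_words: list of one-hot vectors as built above
    # labels: 'borrowed' / 'original' strings
    clf = DecisionTreeClassifier()
    clf.fit(encoded_words, labels)
    return clf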
def feature_six(text):
    # Average sentence length in words.
    return len(get_words(text)) / total_sentences(text)
def feature_four(text):
    # Average word length in characters.
    words = get_words(text)
    characters_sum = 0
    for word in words:
        characters_sum += len(word)
    return characters_sum / len(words)
def feature_one(text):
    # Average number of syllables per word.
    words = get_words(text)
    syllables_sum = 0
    for word in words:
        syllables_sum += count_syllables(word)
    return syllables_sum / len(words)
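# A minimal end-to-end check, assuming the sketched helpers above; the
# feature numbering follows the function names, not a documented order.
# feature_five is skipped because it needs the Dale-Chall word-list file.
if __name__ == '__main__':
    sample = "The cat sat on the mat. It was a remarkably comfortable mat."
    print([feature_one(sample), feature_two(sample), feature_three(sample),
           feature_four(sample), feature_six(sample), feature_seven(sample),
           feature_eight(sample)])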