def pre_processing_pipeline_text(X_train, X_test):
    x_train_process = []
    for text in X_train:
        tmp = pp.to_lower_case(text)
        tmp = pp.substitute_thousands(tmp)
        tmp = pp.fix_common_mistakes(tmp)
        tmp = pp.unstack(tmp)
        tmp = pp.remove_white_space(tmp)
        tmp = pp.remove_punctuation(False, tmp)
        tmp = pp.clean_text(False, tmp)
        tmp = pp.stemming(tmp) # TODO try without it

        x_train_process.append(tmp)

    x_test_process = []
    for text in X_test:
        tmp = pp.to_lower_case(text)
        tmp = pp.substitute_thousands(tmp)
        tmp = pp.fix_common_mistakes(tmp)
        tmp = pp.unstack(tmp)
        tmp = pp.remove_white_space(tmp)
        tmp = pp.remove_punctuation(False, tmp)
        tmp = pp.clean_text(False, tmp)
        tmp = pp.stemming(tmp) # TODO try without it

        x_test_process.append(tmp)

    return x_train_process, x_test_process
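For context, a minimal usage sketch for this function; the module name preprocessing (imported as pp) and the sample strings are assumptions for illustration, not taken from the original project:
import preprocessing as pp  # assumed name of the module providing the pp.* helpers

# Hypothetical raw splits: plain lists of strings.
X_train_raw = ["The model was trained on 3 thousand EXAMPLES!!", "a second training text"]
X_test_raw = ["An unseen test document."]

x_train_clean, x_test_clean = pre_processing_pipeline_text(X_train_raw, X_test_raw)
print(x_train_clean[0])  # lower-cased, cleaned and stemmed version of the first text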
Example #2
def read_unstructure_texts(files):
    texts = []
    for file in files:
        with open(file, encoding='utf8') as f:
            data = f.read()
        data = stemming(remove_stop_words(tokenize(data)))
        texts.append(data)
    return texts
def pre_processing_pipeline(df):
    for index, row in df.iterrows():
        # row is a snapshot from iterrows(), so chain the steps through a
        # local variable instead of re-reading the unmodified row['text']
        # at every step (which would keep only the last transformation).
        tmp = pp.to_lower_case(row['text'])
        tmp = pp.substitute_thousands(tmp)
        tmp = pp.fix_common_mistakes(tmp)
        tmp = pp.unstack(tmp)
        tmp = pp.remove_white_space(tmp)
        tmp = pp.remove_punctuation(False, tmp)
        tmp = pp.clean_text(False, tmp)
        tmp = pp.stemming(tmp)  # TODO try without it
        df.loc[index, 'text'] = tmp

    return df
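A hedged usage sketch for the DataFrame variant; the column name 'text' comes from the snippet, while the sample rows and the pp import are assumptions:
import pandas as pd
import preprocessing as pp  # assumed module name for the pp.* helpers

df = pd.DataFrame({'text': ["First RAW document, about 5 thousand rows!",
                            "Second document."]})
df = pre_processing_pipeline(df)
print(df['text'].tolist())  # each row now holds the cleaned, stemmed text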
Example #4
def read_to_texts(files):
    texts = []
    for file in files:
        with open(file, encoding='utf8') as f:
            data = f.read()
        parsed_result = parse_paper(data)
        text = ''
        for section, content in parsed_result['structure'].items():
            text += content
        text = stemming(remove_stop_words(tokenize(text)))
        texts.append(text)
    return texts
def load_file(path):
    # Walk every folder under `path`, extract the Methods section of each
    # paper, tokenize it, and record the authors' countries per paper.
    texts = []
    num_files = 0
    num_methods = 0
    num_abstract = 0
    paper_attributes = {}
    country_counter = {}  # maps country name -> number of papers
    for folder in os.listdir(path):
        for file in os.listdir(os.path.join(path, folder)):
            num_files += 1
            if num_files <= NUM_START:
                continue
            parsed_result = file2text(os.path.join(path, folder, file))
            method_text = get_methods(parsed_result['text'])
            if method_text:
                token_text = stemming(remove_stop_words(tokenize(method_text)))
                texts.append(token_text)
                num_methods += 1
                author_countries = get_country(parsed_result['text'][:1000])
                if len(author_countries) > 1:
                    num_abstract += 1
                    paper_id = file.split('.')[0]
                    paper_attributes[paper_id] = {'countries': author_countries}
                    for c in author_countries:
                        if c in country_counter:
                            country_counter[c] += 1
                        else:
                            country_counter[c] = 1
            if num_files % 1000 == 0:
                print(num_files, num_methods, num_abstract)
            if num_files > NUM_STOP:
                with open(os.path.join(os.getcwd(), 'metadata', OUTPUT_FILE), 'w') as f:
                    json.dump(paper_attributes, f)
                print(country_counter)
                return
    print("finished extraction")
    print(num_files, num_methods, num_abstract)
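load_file relies on module-level constants and imports defined elsewhere in the script; a sketch of how it might be driven, with assumed values for NUM_START, NUM_STOP, OUTPUT_FILE and a placeholder input path:
import os
import json

# Assumed configuration; the real values live elsewhere in the project.
NUM_START = 0                            # number of files to skip (e.g. already processed)
NUM_STOP = 5000                          # stop and dump the metadata after this many files
OUTPUT_FILE = 'paper_attributes.json'    # hypothetical output file name

if __name__ == '__main__':
    load_file(os.path.join(os.getcwd(), 'papers'))  # 'papers' is a placeholder folder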
Example #6
data = json.load(instances)  # 'instances' is an open file handle to the training-data JSON

word_list = []  # collect all the words from every user response
tags = []
xy = []
for i in data['data']:
    tag = i['tag']
    tags.append(tag)
    for user_response in i['user_responses']:
        normalized = normalization(user_response)
        words = tokenization(normalized)
        word_list.extend(words)
        xy.append((words, tag))  # pair each tokenized response with its tag

word_list = [stemming(word) for word in word_list]
word_list = sorted(set(word_list))
print(tags)
print(word_list)
print(xy)

x_train = []
y_train = []
for (tokenized, tag) in xy:
    bag = bag_of_words(tokenized, word_list)
    x_train.append(bag)

    tag_label = tags.index(tag)
    y_train.append(tag_label)

x_train = np.array(x_train)
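The snippet depends on a bag_of_words helper that is not shown here; a minimal sketch of what such a helper typically looks like (a binary bag-of-words vector over the sorted vocabulary, reusing the snippet's own stemming function) might be:
import numpy as np

def bag_of_words(tokenized_words, word_list):
    # Mark each vocabulary word with 1.0 if it occurs in the (stemmed) sentence.
    stemmed = [stemming(word) for word in tokenized_words]
    bag = np.zeros(len(word_list), dtype=np.float32)
    for idx, word in enumerate(word_list):
        if word in stemmed:
            bag[idx] = 1.0
    return bag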
Example #7
word_list = []  # collect all the words from every user response
tags = []
xy = []
for i in data['data']:
    tag = i['tag']
    tags.append(tag)
    for user_response in i['user_responses']:
        normalized = normalization(user_response)
        words = tokenization(normalized)
        word_list.extend(words)  # extend, not append, so we don't nest a list inside word_list
        xy.append((words, tag))  # pair each tokenized response with its tag

word_list = [stemming(word) for word in word_list]  # stemming also removes the symbols
word_list = sorted(set(word_list))  # to remove duplicate elements
print(tags)
print(word_list)
print(xy)
# build the training data
x_train = []
y_train = []
for (tokenized, tag) in xy:
    bag = bag_of_words(tokenized, word_list)
    x_train.append(bag)

    tag_label = tags.index(tag)
    y_train.append(tag_label)

x_train = np.array(x_train)
def tokenize_text(text):
    token_text = stemming(remove_stop_words(tokenize(text)))
    filtered_text = remove_lf_words(token_text, 2)
    return filtered_text