Example #4
import csv

import nltk

import clean_string  # project-local cleaning utilities


def clean(file_name):
    # Clean every data row of a two-column CSV (id, text) and write
    # the result to cleaned_file.csv; the header row is kept as-is.
    docs = []
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()
    with open(file_name, encoding='utf8', errors='replace') as csv_file:
        csv_reader = csv.reader(csv_file)
        for line_count, row in enumerate(csv_reader):
            if line_count != 0:
                row_content = clean_string.clean(row[1], stemmer, stop_words)
                docs.append([row[0], row_content])
            else:
                docs.append([row[0], row[1]])
    print("Writing to new file...")
    with open('cleaned_file.csv', mode='w', encoding='utf8',
              newline='') as cleaned_file:
        news_writer = csv.writer(cleaned_file, delimiter=',')
        for entry in docs:
            news_writer.writerow([entry[0], entry[1]])
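The clean_string module itself is not shown in these examples. As a minimal
sketch only, assuming clean(text, stemmer, stop_words) lowercases, strips
non-word characters, drops stopwords, and stems, it might look like this
(illustrative, not the project's actual code):

import re


def clean(text, stemmer=None, stop_words=None):
    # Hypothetical sketch of clean_string.clean: lowercase, keep only
    # word characters, then optionally drop stopwords and stem.
    tokens = re.sub(r'[^\w\s]', ' ', text.lower()).split()
    if stop_words is not None:
        tokens = [t for t in tokens if t not in stop_words]
    if stemmer is not None:
        tokens = [stemmer.stem(t) for t in tokens]
    return " ".join(tokens)

The defaults keep this sketch compatible with the single-argument
clean(headline) calls seen in the other examples on this page.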
Example #5
    # Collect the speech-to-text output for each complaint audio part.
    input_complaint_list.append(
        speech_part(bucket_name, file_name + "-" + str(i + 1) + ".mp3", path))

#----------------------------------------------------------------------------------------------------------

# Summarization, cleaning, and classifier input generation

path = "./"

clean_complaint_list = []
summarized_complaint_list = []

# Summarize each complaint and clean it so the trained model can be run on it
for i in range(NoT):
    summarized_complaint = textsummarization.summarize(input_complaint_list[i])
    clean_complaint = clean_string.clean(summarized_complaint)
    summarized_complaint_list.append(summarized_complaint)
    clean_complaint_list.append(clean_complaint)

# Write the cleaned complaints to a .csv file
with open('./test_complaint.csv', 'w') as f:
    for i in range(NoT):
        f.write(clean_complaint_list[i] + '\n')

bucket_name = "complaints-input"
file_name = "test_complaint.csv"

# Upload the complaint .csv file to an S3 bucket
upload_s3.upload(path, bucket_name, file_name)
time.sleep(20)  # crude fixed wait before the next pipeline stage
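The upload_s3 module is likewise project-specific and not shown. A minimal
sketch of what upload_s3.upload(path, bucket_name, file_name) might do with
boto3 (an assumption; the real module may differ):

import boto3


def upload(path, bucket_name, file_name):
    # Hypothetical sketch of upload_s3.upload: push a local file to the
    # given S3 bucket under the same key, using ambient AWS credentials.
    s3 = boto3.client("s3")
    s3.upload_file(path + file_name, bucket_name, file_name)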
def count_feature_generate(name, headlines, bodies):
    """
    generate count feature by given dataset

    name: name of data set, ex) train, competition ...
    headlines: array contains titles12
    bodies: array contains bodies +) heads and bodies most be paired.
    """
    if len(headlines) != len(bodies):
        print(
            "Check headlines size and bodies size for count_feature_generate")
        sys.exit(1)

    def calculate_polarity(text):
        # 1 if the text contains an odd number of refuting words, else 0.
        tokens = clean_string.get_tokenized_lemmas(text)
        return sum([t in _refuting_words for t in tokens]) % 2

    def binary_co_occurence(headline, body):
        # Count how many headline tokens appear anywhere in the body
        # text, and how many appear in its first 255 characters.
        bin_count = 0
        bin_count_early = 0
        clean_body = clean_string.clean(body)
        for headline_token in clean_string.clean(headline).split(" "):
            if headline_token in clean_body:
                bin_count += 1
            if headline_token in clean_body[:255]:
                bin_count_early += 1
        return [bin_count, bin_count_early]

    def binary_co_occurence_stops(headline, body):
        # Same counts as binary_co_occurence, but stopwords in the
        # headline are ignored.
        bin_count = 0
        bin_count_early = 0
        clean_body = clean_string.clean(body)
        for headline_token in clean_string.remove_stopwords(
                clean_string.clean(headline).split(" ")):
            if headline_token in clean_body:
                bin_count += 1
            if headline_token in clean_body[:255]:
                bin_count_early += 1
        return [bin_count, bin_count_early]

    def count_grams(headline, body):
        # Count how many times an n-gram of the title appears in the
        # entire body and in the intro paragraph.

        clean_body = clean_string.clean(body)
        clean_headline = clean_string.clean(headline)
        features = []
        features = append_chargrams(features, clean_headline, clean_body, 2)
        features = append_chargrams(features, clean_headline, clean_body, 8)
        features = append_chargrams(features, clean_headline, clean_body, 4)
        features = append_chargrams(features, clean_headline, clean_body, 16)
        features = append_ngrams(features, clean_headline, clean_body, 2)
        features = append_ngrams(features, clean_headline, clean_body, 3)
        features = append_ngrams(features, clean_headline, clean_body, 4)
        features = append_ngrams(features, clean_headline, clean_body, 5)
        features = append_ngrams(features, clean_headline, clean_body, 6)
        return features

    feature = []
    for headline, body in zip(headlines, bodies):
        clean_headline = clean_string.clean(headline)
        clean_body = clean_string.clean(body)

        grams_feat = binary_co_occurence(
            headline, body) + binary_co_occurence_stops(
                headline, body) + count_grams(headline, body)
        polarity_h_feat = [calculate_polarity(clean_headline)]
        polarity_b_feat = [calculate_polarity(clean_body)]

        clean_headline = clean_string.get_tokenized_lemmas(clean_headline)
        clean_body = clean_string.get_tokenized_lemmas(clean_body)

        # One binary indicator per refuting word in the headline, plus
        # the Jaccard overlap between headline and body token sets.
        refute_feat = [
            1 if word in clean_headline else 0 for word in _refuting_words
        ]
        overlapping_feat = [
            len(set(clean_headline).intersection(clean_body)) /
            float(len(set(clean_headline).union(clean_body)))
        ]

        temp = (grams_feat + refute_feat + polarity_h_feat +
                polarity_b_feat + overlapping_feat)
        feature.append(temp)
    # Persist the feature matrix to disk, then return it as an array.
    np.save(directory_path + name + count_out, feature)

    return np.asarray(feature)
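count_grams relies on append_chargrams and append_ngrams, which are defined
elsewhere in the project. A sketch of what append_ngrams plausibly computes,
assuming each helper appends hit counts of headline grams against the full
body and a 255-character intro window (illustrative only; append_chargrams
would follow the same pattern over character n-grams):

def append_ngrams(features, text_headline, text_body, size):
    # Illustrative sketch: count headline word n-grams of the given size
    # that occur anywhere in the body, and in its first 255 characters.
    tokens = text_headline.split(" ")
    grams = [" ".join(tokens[i:i + size])
             for i in range(len(tokens) - size + 1)]
    features.append(sum(1 for gram in grams if gram in text_body))
    features.append(sum(1 for gram in grams if gram in text_body[:255]))
    return features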