def binary_co_occurence(headline, body):
    """Count headline tokens that occur in the cleaned body text.

    Returns a two-element list:
      [0] tokens found anywhere in the cleaned body,
      [1] tokens found in the first 255 characters of the cleaned body.
    """
    bin_count = 0
    bin_count_early = 0
    # Hoist the body cleaning out of the loop: the original called
    # clean_string.clean(body) twice per headline token, re-cleaning the
    # same text on every iteration.
    clean_body = clean_string.clean(body)
    clean_body_early = clean_body[:255]
    for headline_token in clean_string.clean(headline).split(" "):
        if headline_token in clean_body:
            bin_count += 1
        if headline_token in clean_body_early:
            bin_count_early += 1
    return [bin_count, bin_count_early]
def binary_co_occurence_stops(headline, body):
    """Count headline tokens (stopwords removed) that occur in the body.

    NOTE(review): ``bin_count_early`` is incremented in lockstep with
    ``bin_count`` — unlike ``binary_co_occurence`` there is no ``[:255]``
    "early" slice, so the two returned values are always equal. Preserved
    as-is because downstream models depend on this feature layout; confirm
    whether an early-window check was intended.

    Returns [bin_count, bin_count_early].
    """
    bin_count = 0
    bin_count_early = 0
    # Clean the body once, not once per headline token.
    clean_body = clean_string.clean(body)
    for headline_token in clean_string.remove_stopwords(
            clean_string.clean(headline).split(" ")):
        if headline_token in clean_body:
            bin_count += 1
            bin_count_early += 1
    return [bin_count, bin_count_early]
def count_grams(headline, body):
    """Build char-gram and word n-gram co-occurrence features.

    Counts how often n-grams of the title appear in the entire body and
    in its intro paragraph, accumulating all counts into one flat list.
    """
    cleaned_headline = clean_string.clean(headline)
    cleaned_body = clean_string.clean(body)
    feats = []
    # Character-gram window sizes, kept in the exact order the
    # downstream feature layout expects.
    for size in (2, 8, 4, 16):
        feats = append_chargrams(feats, cleaned_headline, cleaned_body, size)
    # Word n-gram sizes 2 through 6.
    for size in (2, 3, 4, 5, 6):
        feats = append_ngrams(feats, cleaned_headline, cleaned_body, size)
    return feats
def clean(file_name, out_file='cleaned_file.csv'):
    """Clean every document in a CSV file and write the result to disk.

    Reads ``file_name`` as CSV rows of [id, text]. The first row (header)
    is copied through unchanged; every other row's text column is passed
    through clean_string.clean with a Porter stemmer and English
    stopwords. Cleaned rows are written to ``out_file``.

    ``out_file`` defaults to the previously hard-coded
    'cleaned_file.csv' so existing callers are unaffected.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stemmer = nltk.stem.PorterStemmer()
    docs = []
    with open(file_name, encoding='utf8', errors='replace') as csv_file:
        for line_count, row in enumerate(csv.reader(csv_file)):
            if line_count:
                row_content = clean_string.clean(row[1], stemmer, stop_words)
                docs.append([row[0], row_content])
            else:
                # Header row: pass through untouched.
                docs.append([row[0], row[1]])
    print("Writing to new file...")
    with open(out_file, mode='w', encoding='utf8',
              newline='') as cleaned_file:
        news_writer = csv.writer(cleaned_file, delimiter=',')
        for entry in docs:
            news_writer.writerow([entry[0], entry[1]])
input_complaint_list.append( speech_part(bucket_name, file_name + "-" + str(i + 1) + ".mp3", path)) #---------------------------------------------------------------------------------------------------------- #Summarization, Cleaning and Classifier Input Generation Part path = "./" clean_complaint_list = [] summarized_complaint_list = [] #Summarizes the complaints and cleans it for using the trained model on it for i in range(NoT): summarized_complaint = textsummarization.summarize(input_complaint_list[i]) clean_complaint = clean_string.clean(summarized_complaint) summarized_complaint_list.append(summarized_complaint) clean_complaint_list.append(clean_complaint) #Writes the cleaned complaints to a .csv file with open('./test_complaint.csv', 'w') as f: for i in range(NoT): f.write(clean_complaint_list[i] + '\n') bucket_name = "complaints-input" file_name = "test_complaint.csv" #Uploads the complaint (.csv) file to S3 Bucket upload_s3.upload(path, bucket_name, file_name) time.sleep(20)
def count_feature_generate(name, headlines, bodies):
    """
    Generate count features for a dataset and save them to disk.

    name: name of data set, ex) train, competition ...
    headlines: array of title strings
    bodies: array of body strings
    +) headlines and bodies must be paired 1:1; exits with status 1
       when their lengths differ.

    Saves the feature matrix via np.save to
    directory_path + name + count_out and returns it as an ndarray.
    """
    if len(headlines) != len(bodies):
        print(
            "Check headlines size and bodies size for count_feature_generate")
        sys.exit(1)

    def calculate_polarity(text):
        # Parity (mod 2) of the refuting-word count: an even number of
        # refutations is treated as non-refuting.
        tokens = clean_string.get_tokenized_lemmas(text)
        return sum([t in _refuting_words for t in tokens]) % 2

    def binary_co_occurence(headline, body):
        # Count headline tokens appearing anywhere in the cleaned body
        # and within its first 255 characters.  The body is cleaned once
        # here — the original re-cleaned it twice per token.
        bin_count = 0
        bin_count_early = 0
        clean_body = clean_string.clean(body)
        clean_body_early = clean_body[:255]
        for headline_token in clean_string.clean(headline).split(" "):
            if headline_token in clean_body:
                bin_count += 1
            if headline_token in clean_body_early:
                bin_count_early += 1
        return [bin_count, bin_count_early]

    def binary_co_occurence_stops(headline, body):
        # As above, but stopwords in the title are ignored.
        # NOTE(review): both counters move in lockstep (no [:255] early
        # slice), so the two values are always equal; preserved as-is
        # since trained models depend on this feature layout.
        bin_count = 0
        bin_count_early = 0
        clean_body = clean_string.clean(body)
        for headline_token in clean_string.remove_stopwords(
                clean_string.clean(headline).split(" ")):
            if headline_token in clean_body:
                bin_count += 1
                bin_count_early += 1
        return [bin_count, bin_count_early]

    def count_grams(headline, body):
        # Char-gram and word n-gram co-occurrence counts.  The size
        # order (2, 8, 4, 16 then 2..6) is part of the feature layout
        # and must not change.
        clean_body = clean_string.clean(body)
        clean_headline = clean_string.clean(headline)
        features = []
        for size in (2, 8, 4, 16):
            features = append_chargrams(features, clean_headline,
                                        clean_body, size)
        for size in (2, 3, 4, 5, 6):
            features = append_ngrams(features, clean_headline,
                                     clean_body, size)
        return features

    feature = []
    for headline, body in zip(headlines, bodies):
        clean_headline = clean_string.clean(headline)
        clean_body = clean_string.clean(body)
        grams_feat = binary_co_occurence(
            headline, body) + binary_co_occurence_stops(
                headline, body) + count_grams(headline, body)
        polarity_h_feat = [calculate_polarity(clean_headline)]
        polarity_b_feat = [calculate_polarity(clean_body)]
        clean_headline = clean_string.get_tokenized_lemmas(clean_headline)
        clean_body = clean_string.get_tokenized_lemmas(clean_body)
        # One indicator per refuting word: present in the headline or not.
        refute_feat = [
            1 if word in clean_headline else 0 for word in _refuting_words
        ]
        # Jaccard overlap of headline/body token sets.  Guard against an
        # empty union (both texts tokenize to nothing), which previously
        # raised ZeroDivisionError.
        union = set(clean_headline).union(clean_body)
        overlapping_feat = [
            len(set(clean_headline).intersection(clean_body)) /
            float(len(union)) if union else 0.0
        ]
        temp = (grams_feat + refute_feat + polarity_h_feat +
                polarity_b_feat + overlapping_feat)
        feature.append(temp)

    np.save(directory_path + name + count_out, feature)
    return np.asarray(feature)