def process_mask_chyrons(domain_name, version_description="", num_to_process=0):
    """Run stance detection over the mask chyron corpus.

    Reads JSON-lines records from ./mask_dist_chyrons.txt, normalizes
    right single quotes to ASCII apostrophes (so lexicon lookups such as
    "n't" match), and writes one stance JSON object per line into
    ./chyron_mask_stances_output/.

    Args:
        domain_name: key into domain_configs selecting the stance domain.
        version_description: optional suffix for the output file name.
        num_to_process: number of chyrons to process; 0 means all.

    Returns:
        The stance_detection output dict (contains "stances").
    """
    chyron_data = []
    chyrons_file = "./mask_dist_chyrons.txt"
    text_number = 0
    with open(chyrons_file, 'r') as chyron_file:
        chyrons = chyron_file.read().splitlines()
        # NOTE User passes in amount of chyrons to process; if the amount is
        # 0 then it defaults to processing the entirety of the file.
        if num_to_process == 0:
            # BUG FIX: was len(tweets) — an undefined name in this function.
            num_to_process = len(chyrons)
        for chyron_text in chyrons[:num_to_process]:
            chyron = json.loads(chyron_text)
            # \u2019 is a right single quote, which is not typical and will
            # cause issues when looking up items like n't in the lexicons.
            chyron_data.append([
                chyron["chyron_text"].replace("’", "'"), chyron["author"],
                chyron["timestamp"], chyron["doc_id"]
            ])
    # BUG FIX: was passing tweet_full_texts — an undefined name here; the
    # collected rows live in chyron_data.
    (output, text_number) = stance_detection.stances(
        chyron_data, text_number, domain_configs[domain_name])
    Path("./chyron_mask_stances_output").mkdir(exist_ok=True)
    # Add underscore to front of version description if one exists so that
    # the file name will be nicely underscore separated.
    if version_description:
        version_description = "_" + version_description
    with open(
            "./chyron_mask_stances_output/mask_chyron_stances" +
            version_description + ".jsonl", "w+") as mask_chyron_stances:
        for stance in output["stances"]:
            mask_chyron_stances.write(json.dumps(stance))
            mask_chyron_stances.write("\n")
    return output
def process_mask_tweets(domain_name, version_description="", num_to_process=0):
    """Run stance detection over the mask tweet corpus.

    Reads JSON-lines tweets from ./mask_lines.txt, normalizes right
    single quotes to apostrophes, and writes one stance JSON object per
    line into ./mask_stances_output/.

    Args:
        domain_name: key into domain_configs selecting the stance domain.
        version_description: optional suffix for the output file name.
        num_to_process: number of tweets to process; 0 means all.

    Returns:
        The stance_detection output dict (contains "stances").
    """
    tweets_file = "./mask_lines.txt"
    text_number = 0
    with open(tweets_file, 'r') as tweet_file:
        tweets = tweet_file.read().splitlines()
        # A count of 0 means "process every tweet in the file".
        if num_to_process == 0:
            num_to_process = len(tweets)
        tweet_full_texts = []
        for raw_line in tweets[:num_to_process]:
            tweet = json.loads(raw_line)
            # json.loads decodes \u2019 (right single quote), which breaks
            # lexicon lookups such as "n't" — normalize to an apostrophe.
            tweet_full_texts.append([
                tweet["full_text"].replace("’", "'"),
                tweet["user"]["id"],
                tweet["created_at"],
                tweet["id"],
            ])
    output, text_number = stance_detection.stances(
        tweet_full_texts, text_number, domain_configs[domain_name])
    Path("./mask_stances_output").mkdir(exist_ok=True)
    # Underscore-prefix the description so the output file name stays
    # nicely underscore separated.
    if version_description:
        version_description = "_" + version_description
    out_path = ("./mask_stances_output/mask_lines_stances" +
                version_description + ".txt")
    with open(out_path, "w+") as mask_stances:
        for stance in output["stances"]:
            mask_stances.write(json.dumps(stance))
            mask_stances.write("\n")
    return output
def extract_relevant_text(domain_name):
    """Collect tweets whose stemmed tokens contain the topic 'mask' and
    run stance detection on them.

    Reads column 4 of ./2020-04-01-tweets.tsv and returns the first
    element of the stance_detection result for the matching passages.
    """
    stemmer = PorterStemmer()
    column = list(
        pd.read_csv('./2020-04-01-tweets.tsv',
                    sep='\\t',
                    engine='python',
                    header=None)[4])
    topics = ['mask']
    relevant_text = []
    for passage in column:
        # Tokenize before lowering so the tokenizer sees original casing.
        tokens = word_tokenize(passage)
        stemmed_tokens = [stemmer.stem(token).lower() for token in tokens]
        if all(topic in stemmed_tokens for topic in topics):
            relevant_text.append(passage)
    return stance_detection.stances(relevant_text, 0,
                                    domain_configs[domain_name])[0]
def text_to_stances(domain_name, txt_file_path, version_description="", num_to_process=0):
    """Run stance detection over a user-supplied plain-text file.

    Each line of the file is one passage; author, timestamp and doc id
    fields are left empty. Results are written one JSON object per line
    into ./user_provided_stance_output/.

    Args:
        domain_name: key into domain_configs selecting the stance domain.
        txt_file_path: path to the newline-delimited text file.
        version_description: optional suffix for the output file name.
        num_to_process: number of lines to process; 0 means all.

    Returns:
        The stance_detection output dict (contains "stances").
    """
    text_number = 0
    with open(txt_file_path, 'r', encoding="utf-8") as text_file:
        texts = text_file.read().splitlines()
        # A count of 0 means "process every line of the file".
        if num_to_process == 0:
            num_to_process = len(texts)
        # Normalize right single quotes (U+2019) to apostrophes so lexicon
        # lookups like "n't" match. (Both replace targets are the same
        # character; the second call is a harmless no-op kept for parity.)
        data = [[line.replace("’", "'").replace("\u2019", "'"), "", "", ""]
                for line in texts[:num_to_process]]
    output, text_number = stance_detection.stances(
        data, text_number, domain_configs[domain_name])
    Path("./user_provided_stance_output").mkdir(exist_ok=True)
    # Underscore-prefix the description so the output file name stays
    # nicely underscore separated.
    if version_description:
        version_description = "_" + version_description
    out_path = ("./user_provided_stance_output/user_provided_text_stances" +
                version_description + ".jsonl")
    with open(out_path, "w+") as user_text_stances:
        for stance in output["stances"]:
            user_text_stances.write(json.dumps(stance))
            user_text_stances.write("\n")
    return output
def csv_to_stances(domain_name, csv_file_path, text_label, author_label,
                   timestamp_label, doc_id_label, version_description="",
                   num_to_process=0):
    """Run stance detection over a user-supplied CSV file, in batches.

    Args:
        domain_name: key into domain_configs selecting the stance domain.
        csv_file_path: path to the CSV file.
        text_label: column holding the text to analyze.
        author_label: column holding the author identifier.
        timestamp_label: one or more timestamp column names separated by
            "|"; their values are concatenated with spaces.
        doc_id_label: column holding the document id.
        version_description: optional suffix for the output file name.
        num_to_process: number of rows to process; 0 means all.

    Returns:
        A list of every stance dict produced (also written, one JSON
        object per line, to ./user_provided_stance_output/).
    """
    df = pd.read_csv(csv_file_path)
    # A count of 0 means "process the entire file".
    if num_to_process == 0:
        num_to_process = len(df.index)
    # BUG FIX: truncate the frame up front instead of break-ing per row —
    # the old break only exited the inner loop, so every later batch still
    # triggered a stance_detection call on an empty batch.
    df = df.iloc[:num_to_process]
    # Split the data into batches of roughly 10 rows each.
    # TODO: make the batch size a parameter the user can pass in.
    num_chunks = max(1, math.floor(len(df.index) / 10))
    Path("./user_provided_stance_output").mkdir(exist_ok=True)
    # Add underscore to front of version description if one exists so that
    # the file name will be nicely underscore separated.
    if version_description:
        version_description = "_" + version_description
    total_output = []
    text_number = 0
    # Loop-invariant: the timestamp column list never changes per row.
    timestamp_cols = timestamp_label.split("|")
    with open(
            "./user_provided_stance_output/user_provided_csv_stances" +
            version_description + ".jsonl", "w+") as user_json_stances:
        for chunk in np.array_split(df, num_chunks):
            data = []
            # iterrows yields (index, row) tuples; we only need the row.
            for _, row_info in chunk.iterrows():
                timestamp_data = ''
                for label in timestamp_cols:
                    # str() guards against non-string (e.g. numeric) columns.
                    timestamp_data += str(row_info[label]) + " "
                # \u2019 is unicode for right single quote which is not
                # typical and will cause issues when looking up items like
                # n't in the lexicons.
                data.append([
                    row_info[text_label].replace("’", "'").replace(
                        "\u2019", "'"), row_info[author_label],
                    timestamp_data, row_info[doc_id_label]
                ])
            (chunk_output, text_number) = stance_detection.stances(
                data, text_number, domain_configs[domain_name])
            total_output.extend(chunk_output["stances"])
            for stance in chunk_output["stances"]:
                user_json_stances.write(json.dumps(stance))
                user_json_stances.write("\n")
            print("Batch finished")
    return total_output
def json_to_stances(domain_name, json_file_path, text_attrb_name,
                    author_attrb_name, timestamp_attrb_name,
                    doc_id_attrb_name, version_description="",
                    num_to_process=0):
    """Run stance detection over a user-supplied JSON-lines file, in batches.

    Attribute names may be comma-separated paths into nested objects;
    they are resolved with handle_nested_json.

    Args:
        domain_name: key into domain_configs selecting the stance domain.
        json_file_path: path to the newline-delimited JSON file.
        text_attrb_name: (possibly nested) attribute path of the text.
        author_attrb_name: attribute path of the author.
        timestamp_attrb_name: attribute path of the timestamp.
        doc_id_attrb_name: attribute path of the document id.
        version_description: optional suffix for the output file name.
        num_to_process: number of records to process; 0 means all.

    Returns:
        A list of every stance dict produced (also written, one JSON
        object per line, to ./user_provided_stance_output/).
    """
    Path("./user_provided_stance_output").mkdir(exist_ok=True)
    with open(json_file_path, 'r') as json_file:
        jsons = json_file.read().splitlines()
    items_per_chunk = 10
    # A count of 0 means "process the entire file".
    if num_to_process == 0:
        num_to_process = len(jsons)
    # BUG FIX: slice the list up front instead of break-ing per row — the
    # old break only exited the inner loop, so every later batch still
    # triggered a stance_detection call on an empty batch.
    jsons = jsons[:num_to_process]
    # Add underscore to front of version description if one exists so that
    # the file name will be nicely underscore separated.
    if version_description:
        version_description = "_" + version_description
    # Hoisted out of the loop: the attribute paths never change per line.
    nested_text_attrbs = text_attrb_name.split(",")
    nested_author_attrbs = author_attrb_name.split(",")
    nested_timestamp_attrbs = timestamp_attrb_name.split(",")
    nested_doc_id_attrbs = doc_id_attrb_name.split(",")
    total_output = []
    text_number = 0
    with open(
            "./user_provided_stance_output/user_provided_json_stances" +
            version_description + ".jsonl", "w+") as user_json_stances:
        for i in range(0, len(jsons), items_per_chunk):
            data = []
            for line_json in jsons[i:i + items_per_chunk]:
                line = json.loads(line_json)
                text = handle_nested_json(nested_text_attrbs, line)
                author = handle_nested_json(nested_author_attrbs, line)
                timestamp = handle_nested_json(nested_timestamp_attrbs, line)
                doc_id = handle_nested_json(nested_doc_id_attrbs, line)
                # \u2019 is unicode for right single quote which is not
                # typical and will cause issues when looking up items like
                # n't in the lexicons.
                data.append(
                    [text.replace("’", "'"), author, timestamp, doc_id])
            (chunk_output, text_number) = stance_detection.stances(
                data, text_number, domain_configs[domain_name])
            total_output.extend(chunk_output["stances"])
            for stance in chunk_output["stances"]:
                user_json_stances.write(json.dumps(stance))
                user_json_stances.write("\n")
            print("Batch finished")
    return total_output
def test(text, domain_name):
    """Run stance detection on a single piece of text (quick smoke test).

    Args:
        text: the passage to analyze.
        domain_name: key into domain_configs selecting the stance domain.

    Returns:
        The stance_detection output dict for the lone input.
    """
    # Normalize the right single quote and pass the same 4-field record
    # shape ([text, author, timestamp, doc_id]) used by every other caller
    # in this file — the original passed a stray fifth empty field.
    return stance_detection.stances([[text.replace("’", "'"), '', '', '']],
                                    0, domain_configs[domain_name])[0]