Example 1
import json
from pathlib import Path

#stance_detection and domain_configs are assumed to be defined at module level
def process_mask_chyrons(domain_name,
                         version_description="",
                         num_to_process=0):
    chyron_data = []
    chyrons_file = "./mask_dist_chyrons.txt"
    text_number = 0
    with open(chyrons_file, 'r') as chyron_file:
        chyrons = chyron_file.read().splitlines()

        #NOTE User passes in the number of chyrons to process; if it is 0, it
        # defaults to processing the entire file
        if num_to_process == 0:
            num_to_process = len(chyrons)

        for chyron_text in chyrons[:num_to_process]:
            chyron = json.loads(chyron_text)
            #json.loads converts \u2019 to a right single quote, which is not typical
            # and will cause issues when looking up items like n't in the lexicons
            chyron_data.append([
                chyron["chyron_text"].replace("’", "'"), chyron["author"],
                chyron["timestamp"], chyron["doc_id"]
            ])

    (output,
     text_number) = stance_detection.stances(chyron_data, text_number,
                                             domain_configs[domain_name])

    Path("./chyron_mask_stances_output").mkdir(exist_ok=True)

    #Add underscore to front of version description if one exists so that the
    # file name will be nicely underscore separated
    if version_description:
        version_description = "_" + version_description

    with open(
            "./chyron_mask_stances_output/mask_chyron_stances" +
            version_description + ".jsonl", "w+") as mask_chyron_stances:
        for stance in output["stances"]:
            mask_chyron_stances.write(json.dumps(stance))
            mask_chyron_stances.write("\n")

    return output
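
A note on the input format: based on the keys read in the loop above, ./mask_dist_chyrons.txt is expected to be a JSON-lines file where each line looks roughly like this (the values here are illustrative, not from the actual data):

{"chyron_text": "Officials urge residents to wear masks", "author": "CNN", "timestamp": "2020-04-01 18:00", "doc_id": "12345"}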
Example 2
import json
from pathlib import Path

def process_mask_tweets(domain_name, version_description="", num_to_process=0):
    tweet_full_texts = []
    tweets_file = "./mask_lines.txt"

    text_number = 0
    with open(tweets_file, 'r') as tweet_file:
        tweets = tweet_file.read().splitlines()

        #NOTE User passes in the number of tweets to process; if it is 0, it
        # defaults to processing all of the tweets
        if num_to_process == 0:
            num_to_process = len(tweets)

        for tweet_text in tweets[:num_to_process]:
            tweet = json.loads(tweet_text)

            #json.loads automatically converts unicode escapes like \u2019 to a right single
            # quote, which is not typical and will cause issues when looking up items like n't in the lexicons
            tweet_full_texts.append([
                tweet["full_text"].replace("’", "'"), tweet["user"]["id"],
                tweet["created_at"], tweet["id"]
            ])

    (output,
     text_number) = stance_detection.stances(tweet_full_texts, text_number,
                                             domain_configs[domain_name])

    Path("./mask_stances_output").mkdir(exist_ok=True)

    #Add underscore to front of version description if one exists so that the
    # file name will be nicely underscore separated
    if version_description:
        version_description = "_" + version_description

    with open(
            "./mask_stances_output/mask_lines_stances" + version_description +
            ".txt", "w+") as mask_stances:
        for stance in output["stances"]:
            mask_stances.write(json.dumps(stance))
            mask_stances.write("\n")
    return output
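
Judging from the keys accessed above (full_text, user.id, created_at, id), a minimal line of ./mask_lines.txt would be a serialized tweet object along these lines (the values are made up):

{"full_text": "Everyone should wear a mask.", "user": {"id": 42}, "created_at": "Wed Apr 01 12:00:00 +0000 2020", "id": 1245000000000000000}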
Example 3
import pandas as pd
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

def extract_relevant_text(domain_name):
    stemmer = PorterStemmer()
    relevant_text = []
    df = list(
        pd.read_csv('./2020-04-01-tweets.tsv',
                    sep='\\t',
                    engine='python',
                    header=None)[4])

    def has_topic(passage, topics):
        tokens = word_tokenize(passage)  # tokenize before lowering
        stemmed_tokens = [stemmer.stem(token).lower() for token in tokens]
        if all([x in stemmed_tokens for x in topics]):
            relevant_text.append(passage)

    for text in df:
        has_topic(text, ['mask'])

    return stance_detection.stances(relevant_text, 0,
                                    domain_configs[domain_name])[0]
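
Since has_topic compares against stemmed, lowercased tokens, the topics passed in should themselves be stems. A quick check of why 'mask' also matches inflected forms:

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()
#Both inflected forms reduce to the stem 'mask', so they satisfy topics=['mask']
print(stemmer.stem("Masks").lower())    # mask
print(stemmer.stem("masking").lower())  # mask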
Example 4
import json
from pathlib import Path

def text_to_stances(domain_name,
                    txt_file_path,
                    version_description="",
                    num_to_process=0):
    data = []
    texts_file = txt_file_path
    text_number = 0
    with open(texts_file, 'r', encoding="utf-8") as text_file:
        texts = text_file.read().splitlines()

        #NOTE User passes in the number of lines to process; if it is 0, it
        # defaults to processing the entire file
        if num_to_process == 0:
            num_to_process = len(texts)

        for text in texts[:num_to_process]:
            #\u2019 is unicode for right single quote, which is not typical and will
            # cause issues when looking up items like n't in the lexicons
            data.append([text.replace("\u2019", "'"), "", "", ""])

    (output,
     text_number) = stance_detection.stances(data, text_number,
                                             domain_configs[domain_name])
    Path("./user_provided_stance_output").mkdir(exist_ok=True)

    #Add underscore to front of version description if one exists so that the
    # file name will be nicely underscore separated
    if version_description:
        version_description = "_" + version_description

    with open(
            "./user_provided_stance_output/user_provided_text_stances" +
            version_description + ".jsonl", "w+") as user_text_stances:
        for stance in output["stances"]:
            user_text_stances.write(json.dumps(stance))
            user_text_stances.write("\n")

    return output
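
A typical call, assuming a plain-text file with one passage per line (the path, domain name, and description below are placeholders):

output = text_to_stances("mask", "./my_passages.txt",
                         version_description="v1", num_to_process=100)
print(len(output["stances"]))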
Example 5
import json
import math
from pathlib import Path

import numpy as np
import pandas as pd

def csv_to_stances(domain_name,
                   csv_file_path,
                   text_label,
                   author_label,
                   timestamp_label,
                   doc_id_label,
                   version_description="",
                   num_to_process=0):
    num_processed = 0

    df = pd.read_csv(csv_file_path)

    #NOTE User passes in the number of rows to process; if it is 0, it
    # defaults to processing the entire file
    if num_to_process == 0:
        num_to_process = len(df.index)

    #This decides how many chunks to split the data into so that batches of the
    # specified size are processed. The batch size should probably become a
    # variable that can be passed in by the user
    num_chunks = math.floor(len(df.index) / 10)
    if num_chunks == 0:
        num_chunks = 1

    Path("./user_provided_stance_output").mkdir(exist_ok=True)

    #Add underscore to front of version description if one exists so that the
    # file name will be nicely underscore separated
    if version_description:
        version_description = "_" + version_description

    total_output = []
    text_number = 0
    with open(
            "./user_provided_stance_output/user_provided_csv_stances" +
            version_description + ".jsonl", "w+") as user_json_stances:
        for chunk in np.array_split(df, num_chunks):
            data = []
            chunk_output = {"stances": []}
            #iterrows yields (index, row) tuples; we only need the row info
            for _, row_info in chunk.iterrows():
                timestamp_data = ''
                num_processed += 1

                if num_processed > num_to_process:
                    break

                labels = timestamp_label.split("|")

                for label in labels:
                    timestamp_data += str(row_info[label]) + " "

                #\u2019 is unicode for right single quote, which is not typical and will
                # cause issues when looking up items like n't in the lexicons
                data.append([
                    row_info[text_label].replace("\u2019", "'"),
                    row_info[author_label], timestamp_data,
                    row_info[doc_id_label]
                ])

            (chunk_output, text_number) = stance_detection.stances(
                data, text_number, domain_configs[domain_name])
            total_output.extend(chunk_output["stances"])

            for stance in chunk_output["stances"]:
                user_json_stances.write(json.dumps(stance))
                user_json_stances.write("\n")

            print("Batch finished")

            #Stop once the requested number of rows has been processed so later
            # chunks are not run as empty batches
            if num_processed > num_to_process:
                break

    return total_output
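
Because timestamp_label is split on "|", the timestamp can be assembled from several CSV columns. A call might look like this (the file path and column names are hypothetical):

stances = csv_to_stances("mask", "./tweets.csv",
                         text_label="text",
                         author_label="screen_name",
                         timestamp_label="date|time",
                         doc_id_label="tweet_id",
                         version_description="v2")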
Example 6
import json
from pathlib import Path

def json_to_stances(domain_name,
                    json_file_path,
                    text_attrb_name,
                    author_attrb_name,
                    timestamp_attrb_name,
                    doc_id_attrb_name,
                    version_description="",
                    num_to_process=0):
    num_processed = 0

    Path("./user_provided_stance_output").mkdir(exist_ok=True)

    with open(json_file_path, 'r') as json_file:
        jsons = json_file.read().splitlines()

        items_per_chunk = 10

        #NOTE User passes in the number of JSON lines to process; if it is 0, it
        # defaults to processing the entire file
        if num_to_process == 0:
            num_to_process = len(jsons)

        #Add underscore to front of version description if one exists so that the
        # file name will be nicely underscore separated
        if version_description:
            version_description = "_" + version_description

        total_output = []
        text_number = 0
        with open(
                "./user_provided_stance_output/user_provided_json_stances" +
                version_description + ".jsonl", "w+") as user_json_stances:

            for i in range(0, len(jsons), items_per_chunk):
                data = []
                chunk_output = {"stances": []}

                for line_json in jsons[i:i + items_per_chunk]:
                    line = json.loads(line_json)

                    nested_text_attrbs = text_attrb_name.split(",")
                    nested_author_attrbs = author_attrb_name.split(",")
                    nested_timestamp_attrbs = timestamp_attrb_name.split(",")
                    nested_doc_id_attrbs = doc_id_attrb_name.split(",")

                    text = handle_nested_json(nested_text_attrbs, line)
                    author = handle_nested_json(nested_author_attrbs, line)
                    timestamp = handle_nested_json(nested_timestamp_attrbs,
                                                   line)
                    doc_id = handle_nested_json(nested_doc_id_attrbs, line)

                    num_processed += 1

                    if num_processed > num_to_process:
                        break

                    #\u2019 is unicode for right single quote which is not typical, and will cause issues when looking
                    # up items like n't in the lexicons
                    data.append(
                        [text.replace("’", "'"), author, timestamp, doc_id])

                (chunk_output, text_number) = stance_detection.stances(
                    data, text_number, domain_configs[domain_name])
                total_output.extend(chunk_output["stances"])

                for stance in chunk_output["stances"]:
                    user_json_stances.write(json.dumps(stance))
                    user_json_stances.write("\n")

                print("Batch finished")

                #Stop once the requested number of lines has been processed so later
                # chunks are not run as empty batches
                if num_processed > num_to_process:
                    break

    return total_output
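
handle_nested_json is defined elsewhere in the module. Given that the attribute names are comma-split before each call, a minimal sketch consistent with this usage would walk the keys in order (this is an assumption about the helper, not its actual implementation):

def handle_nested_json(nested_attrbs, line):
    #Assumed behavior: follow each comma-separated key into the nested dict,
    # e.g. ["user", "id"] would return line["user"]["id"]
    value = line
    for attrb in nested_attrbs:
        value = value[attrb]
    return value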
Example 7
def test(text, domain_name):

    #Match the [text, author, timestamp, doc_id] shape used by the other callers
    return stance_detection.stances([[text.replace("’", "'"), '', '', '']], 0,
                                    domain_configs[domain_name])[0]