def prepare_processed_training_data():
    """
    Generate all features into processes/... folder from interim/...
    """
    logger = logging.getLogger(__name__)
    logger.info('Making processed training data set from interim data')

    # Init absolute path of folders
    processed_folder_path = os.path.join(DATA_PROCESSED_ROOT, DATASET_NAME)
    interim_folder_path = os.path.join(DATA_INTERIM_ROOT, DATASET_NAME)

    if os.path.exists(processed_folder_path):
        shutil.rmtree(processed_folder_path)
    os.makedirs(processed_folder_path)

    for event_name in DATASET_EVENTS:

        event_folder_path = os.path.join(interim_folder_path, event_name)
        list_tweet_ids = [
            name for name in os.listdir(event_folder_path)
            if os.path.isfile(os.path.join(event_folder_path, name))
        ]

        processed_event_folder_path = os.path.join(processed_folder_path,
                                                   event_name)
        os.makedirs(processed_event_folder_path)

        train_processed_file = open(
            os.path.join(processed_event_folder_path, 'train.txt'), "w")
        train_processed_label_file = open(
            os.path.join(processed_event_folder_path, 'train_label.txt'), "w")

        tweet_count = len(list_tweet_ids)

        for index, id in enumerate(list_tweet_ids):
            print(event_name, '+', index)
            source_tweet = json_from_file(os.path.join(event_folder_path, id))
            features = collect_feature(source_tweet)
            features_str = "\t".join([str(i) for i in features])
            train_processed_file.write(features_str)
            if index != tweet_count - 1:
                train_processed_file.write('\n')
            train_processed_label_file.write(
                str(VERACITY_LABELS_MAPPING[source_tweet['veracity']]))
            if index != tweet_count - 1:
                train_processed_label_file.write('\n')

        train_processed_file.close()
        train_processed_label_file.close()
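
# A minimal usage sketch (not from the original source): it assumes this module
# imports logging/os/shutil and defines the constants and helpers referenced
# above (DATA_PROCESSED_ROOT, DATA_INTERIM_ROOT, DATASET_NAME, DATASET_EVENTS,
# VERACITY_LABELS_MAPPING, json_from_file, collect_feature).
if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    prepare_processed_training_data()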
import json
import logging
from kafka import KafkaConsumer
import pymongo
from bson import ObjectId
from email.mime.text import MIMEText
from email.header import Header
from subprocess import Popen, PIPE

from utils import json_from_file

COMMASPACE = ', '
config_file_name = 'config.json'
config = {}

try:
    config = json_from_file(config_file_name, "Can't open ss-config file.")
except RuntimeError as e:
    print(e)
    exit()
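
# For reference, a minimal config.json shape that satisfies the keys read
# below; the concrete values here are an assumption, not part of the original:
# {
#     "logging.format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
#     "logging.file": "app.log",
#     "logging.level": 20
# }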

formatter = logging.Formatter(config['logging.format'])
# Create handlers
c_handler = logging.StreamHandler()
f_handler = logging.FileHandler(config['logging.file'])

# Create formatters and add it to handlers
c_handler.setFormatter(formatter)
f_handler.setFormatter(formatter)

logging_level = config["logging.level"] if 'logging.level' in config else 20
print("Selecting logging level", logging_level)
Example 3
def prepare_test_data():
    """ Runs data processing scripts to turn testing raw data from (../raw) into
            interim data to be analyzed (saved in ../interim).
    """

    logger = logging.getLogger(__name__)
    logger.info('Making interim test data set from raw data')

    # Init absolute path of folders
    raw_input_folder_path = os.path.join(DATA_RAW_ROOT, TESTSET_NAME)
    raw_output_folder_path = os.path.join(DATA_RAW_ROOT, TESTSET_NAME)
    interim_folder_path = os.path.join(DATA_INTERIM_ROOT, TESTSET_NAME)

    # Read veracity labels from the test label file
    veracity_labels = json_from_file(os.path.join(raw_output_folder_path, VERACITY_LABEL_TEST_FILE[0]))

    # Read stance labels from the test label file
    stance_labels = json_from_file(os.path.join(raw_output_folder_path, STANCE_LABEL_TEST_FILE[0]))


    # If interim data already exists, delete it and create a fresh folder
    if os.path.exists(interim_folder_path):
        shutil.rmtree(interim_folder_path)
    os.makedirs(interim_folder_path)

    list_tweet_ids = [name for name in os.listdir(raw_input_folder_path) if
                      os.path.isdir(os.path.join(raw_input_folder_path, name))]

    for index, id in enumerate(list_tweet_ids):

        # thread conversation folder in raw
        source_tweet_folder_path = os.path.join(raw_input_folder_path, id)

        # read source tweet
        with open(os.path.join(source_tweet_folder_path, 'source-tweet', id + '.json'), 'r') as source_tweet_file:
            source_tweet = json.load(source_tweet_file)
        source_tweet_replies = []

        # read replies
        replies_folder_path = os.path.join(source_tweet_folder_path, 'replies')
        list_reply_ids = [name for name in os.listdir(replies_folder_path) if
                          os.path.isfile(os.path.join(replies_folder_path, name))]
        for reply_id in list_reply_ids:
            with open(os.path.join(replies_folder_path, reply_id), "r") as reply_file:
                reply = json.load(reply_file)
            reply['stance'] = stance_labels[reply['id_str']]
            source_tweet_replies.append(reply)

        source_tweet['replies'] = source_tweet_replies

        # read structure
        with open(os.path.join(source_tweet_folder_path, 'structure.json'), "r") as structure_file:
            structure = json.load(structure_file)
        source_tweet['structure'] = structure

        source_tweet['veracity'] = veracity_labels[source_tweet['id_str']]

        source_tweet['stance'] = stance_labels[source_tweet['id_str']]

        # write enriched tweet to interim
        with open(os.path.join(interim_folder_path, str(index) + '.json'), "w") as interim_tweet_file:
            interim_tweet_file.write(json.dumps(source_tweet, indent=4))
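
# A hypothetical helper (not in the original source) sketching how one interim
# record written above could be read back; it assumes the same module-level
# constants DATA_INTERIM_ROOT and TESTSET_NAME and the json_from_file helper:
def peek_interim_test_record(index=0):
    """Return the fields prepare_test_data attaches to one interim source tweet."""
    record_path = os.path.join(DATA_INTERIM_ROOT, TESTSET_NAME, str(index) + '.json')
    record = json_from_file(record_path)
    # 'replies', 'structure', 'veracity' and 'stance' are added on top of the raw tweet
    return record['veracity'], record['stance'], len(record['replies'])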
Example 4
import os
from distutils.util import strtobool

import jq

from utils import (BASE, create_comment, delete_comments, json_from_file,
                   request, validate_file)

json_schema = os.getenv('INPUT_JSON_SCHEMA')
json_path_pattern = os.getenv('INPUT_JSON_PATH_PATTERN')
send_comment = strtobool(os.getenv('INPUT_SEND_COMMENT'))
clear_comments = strtobool(os.getenv('INPUT_CLEAR_COMMENTS'))

event_path = os.getenv('GITHUB_EVENT_PATH')
repo = os.getenv('GITHUB_REPOSITORY')

PR_FILES = BASE + '/repos/{repo}/pulls/{pull_number}/files'

event = json_from_file(event_path)
pull_number = jq.compile('.pull_request.number').input(event).first()

errors = []
pr_files_url = PR_FILES.format(repo=repo, pull_number=pull_number)
pr_files = request('get', pr_files_url)

for pr_file in pr_files:
    filename = pr_file['filename']
    validation_errors = validate_file(json_schema, json_path_pattern, filename)

    if len(validation_errors):
        errors.append({'path': filename, 'errors': validation_errors})

if clear_comments:
    delete_comments(repo, pull_number)
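
# Illustrative continuation (an assumption, not part of the shown source): when
# validation errors were collected and commenting is enabled, a summary could be
# posted back to the pull request via the create_comment helper imported above;
# its exact signature (repo, pull number, body) is assumed here.
if errors and send_comment:
    body_lines = ['JSON schema validation failed:', '']
    for error in errors:
        body_lines.append('`{path}`: {errors}'.format(**error))
    create_comment(repo, pull_number, '\n'.join(body_lines))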