Example #1

import json
import os
import re

# assumed module-level context (defined elsewhere in the original source):
# write_log(), find_best_url(), and the globals out_dir and dict_url2id

def raw_to_per_day(raw_path):
    """
        extract user-specifit interaction data for each file in parallel
        :raw_path: path of data file
        :return: none
    """
    global out_dir, dict_url2id

    write_log('Processing : {}'.format(raw_path))

    with open(raw_path, 'r') as f_raw:
        lines = f_raw.readlines()

    dict_per_user = {}
    list_per_time = []

    total_count = len(lines)
    count = 0

    for line in lines:
        if count % 10000 == 0:
            write_log('Processing({}) : {}/{}'.format(raw_path, count,
                                                      total_count))
        count += 1

        line = line.strip()
        line_json = json.loads(line)

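        # pull the user id, best url, timestamp and article id out of the event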
        user_id = line_json.get('userId', None)
        url = find_best_url(event_dict=line_json)
        time = line_json.get('time', -1)
        article_id = line_json.get('id', None)

        if (user_id is None) or (url is None) or (time < 0) or (article_id is None):
            continue

        if user_id not in dict_per_user:
            dict_per_user[user_id] = []

        dict_per_user[user_id].append((time, url))
        list_per_time.append((time, user_id, url))

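        # record this url's article id in the shared url -> id map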
        dict_url2id[url] = article_id

    lines = None

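    # write both views as json files named after the raw input file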
    per_user_path = os.path.join(out_dir, 'per_user', os.path.basename(raw_path))
    per_time_path = os.path.join(out_dir, 'per_time', os.path.basename(raw_path))

    with open(per_user_path, 'w') as f_user:
        json.dump(dict_per_user, f_user)

    with open(per_time_path, 'w') as f_time:
        json.dump(list_per_time, f_time)

    dict_per_user = None
    list_per_time = None

    write_log('Done : {}'.format(raw_path))


def extract_article_content(content_dir):
    """
    Read every article content file in content_dir, keep only the fields
    needed downstream, preprocess the title and body, and dump the
    resulting url -> article data dictionary as JSON.
    """
    target_files = []

    for file_name in os.listdir(content_dir):
        file_path = os.path.join(content_dir, file_name)

        if not os.path.isfile(file_path):
            continue

        target_files.append(file_path)

    output = {}
    for file_idx, file_path in enumerate(target_files):
        with open(file_path, 'r') as f_con:
            lines = [line.strip() for line in f_con if line.strip()]

        for line in lines:
            try:
                dict_cont = json.loads(line)
            except json.JSONDecodeError:
                print('Error: {}'.format(line))
                continue

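            # keep only the whitelisted fields from this content record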
            dict_data = {}

            for field in dict_cont.get('fields', []):
                field_name = field.get('field', None)
                field_value = field.get('value', None)

                if not field_name or not field_value:
                    continue

                if field_name not in ['url', 'cannonicalUrl', 'referrerUrl',
                                      'title', 'body', 'category0', 'category1']:
                    continue

                dict_data[field_name] = field_value

            # find the best URL
            best_url = find_best_url(dict_data)
            if not best_url:
                continue

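            # best_url is the key from here on; the raw url fields are no longer needed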
            for key in ['url', 'cannonicalUrl', 'referrerUrl']:
                dict_data.pop(key, None)

            # preprocess title & body
            if ('title' not in dict_data) or ('body' not in dict_data):
                continue

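            # strip punctuation characters, collapsing runs into single spaces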
            def preprocess_sentence(sentences):
                new_sentences = []
                regex_remove = re.compile(r'[\'|\"|,|\-|\.| |\?|«|»|:|!|–|@|\(|\)|−]+')
                for sentence in sentences:
                    sentence = regex_remove.sub(' ', sentence)
                    new_sentences.append(sentence.strip())
                return new_sentences

            dict_data['sentence_header'] = preprocess_sentence([dict_data['title']])
            dict_data['sentence_body'] = preprocess_sentence(dict_data['body'])

            for key in ['title', 'body']:
                dict_data.pop(key, None)

            output[best_url] = dict_data

    write_log('Save to Json : start')
    with open(out_dir, 'w') as f_json:
        json.dump(output, f_json)
    write_log('Save to Json : end')
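
A minimal driver sketch (not part of the original example): the docstring of raw_to_per_day says the extraction is meant to run for each file in parallel, so one plausible way to drive it is a multiprocessing.Pool mapped over the raw file paths. The directory path below is hypothetical, and out_dir with its per_user / per_time subdirectories is assumed to be set up by the surrounding module.

if __name__ == '__main__':
    from multiprocessing import Pool

    raw_dir = '/path/to/raw_logs'  # hypothetical input directory
    raw_files = sorted(
        os.path.join(raw_dir, name)
        for name in os.listdir(raw_dir)
        if os.path.isfile(os.path.join(raw_dir, name))
    )

    # note: each worker process fills its own copy of the global dict_url2id;
    # merge those mappings in the parent if they are needed afterwards
    with Pool(processes=4) as pool:
        pool.map(raw_to_per_day, raw_files)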