Example #1
0
import os
import logging
import json
#####
import logging_factory
#####
logger_err = logging_factory.get_module_logger("file_manager_err",
                                               logging.ERROR)
logger = logging_factory.get_module_logger("file_manager", logging.DEBUG)


def count_lines_file(path: str) -> int:
    """
    Returns the number of lines in a file given its path

    :param path: str - the path to the file
    :return: int - the number of lines in the file
    """

    # Count by consuming the iterator so that an empty file returns 0 instead
    # of raising UnboundLocalError on the undefined loop variable
    with open(path) as f:
        return sum(1 for _ in f)
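
# Hypothetical usage sketch (the file name is an assumption, not taken from the
# original examples):
#
#     n_lines = count_lines_file("subreddit_backup.jsonl")
#     logger.debug("Backup contains %d lines", n_lines)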


def clear_file(save_path: str):
    """
    Given a path to a file, clears the contents of the file

    :param save_path: str - path to the file
    """
Example #2
0
import logging
import json
import os
#####
import logging_factory
#####
from typing import Optional
#####
logger_err = logging_factory.get_module_logger("tools_err", logging.ERROR)
logger = logging_factory.get_module_logger("tools", logging.DEBUG)


def obtain_usernames(subr_path: str):
    """
    Given the path of the backup, generates a .txt file containing the authors found in the backup

    :param subr_path: str - path to the file (i.e. the subreddit file)
    """

    subr_authors = set()

    try:
        with open(subr_path, "r") as input_file:
            for line in input_file:
                try:
                    loaded = json.loads(line)
                    author = loaded["author"]
                    if author != "[deleted]":
                        subr_authors.add(author)
                except KeyError:
                    logger_err.error(
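
# Hypothetical sketch (assumption, not shown in the original example): once the
# authors have been collected, they could be written out to the .txt file the
# docstring describes, e.g.:
#
#     with open("authors.txt", "w") as output_file:
#         for author in sorted(subr_authors):
#             output_file.write(author + "\n")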
Example #3
0
import json
import logging
import math
import pandas as pd
#####
import logging_factory
import indexer
#####
from elasticsearch import Elasticsearch, ConnectionTimeout, TransportError, ConnectionError
from elasticsearch_dsl import Search, Q
#####
logger_err = logging_factory.get_module_logger("questioner_err", logging.ERROR)
logger = logging_factory.get_module_logger("questioner", logging.DEBUG)


def extract_authors_info(authors_path: str):
    """
    Given a .txt file containing the names of the authors, searches in an Elasticsearch index their corresponding
    information (for reddit: account identifier, username, date of creation, date of retrieval, and comment and
    link karma scores). Generates a .jsonl file containing all the authors' info sorted by their account id.

    :param authors_path: str - path to the .txt file containing the authors
    """

    host, port = "localhost", 9200
    es = Elasticsearch(hosts=[{"host": host, "port": port}])
    search = Search(using=es, index="reddit_users")
    max_query_size = 50000

    authors = []
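
    # Hypothetical continuation (assumptions: the index stores usernames in a
    # "username" field and queries are chunked to respect max_query_size):
    #
    #     with open(authors_path, "r") as input_file:
    #         authors = [line.strip() for line in input_file if line.strip()]
    #
    #     for i in range(math.ceil(len(authors) / max_query_size)):
    #         chunk = authors[i * max_query_size:(i + 1) * max_query_size]
    #         response = search.query(Q("terms", username=chunk))[0:max_query_size].execute()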
Example #4
0
import logging
import gzip
import json
#####
import logging_factory
#####
from elasticsearch import Elasticsearch, helpers, ConnectionTimeout, ConnectionError
from elasticsearch.helpers import BulkIndexError
#####
logger_err = logging_factory.get_module_logger("indexer_err", logging.ERROR)
logger = logging_factory.get_module_logger("indexer", logging.DEBUG)


def decode_file(file_handler, is_csv: bool):
    """
    Given a file handler (for .csv and .jsonl files), parses each line and yields an index and a dictionary with the
    required data

    :param file_handler: the file handler containing the lines to be processed
    :param is_csv: bool - True if the file handler is for .csv files, False if it's for .jsonl files
    :return: yielded index (str) and a dictionary containing the account identifier, the username, the date of
    creation of the account, the date of retrieval, and the comment and link karma scores
    """

    es_fields_keys = ("acc_id", "username", "created", "updated",
                      "comment_karma", "link_karma")

    # If it's a .csv file, skip the header
    if is_csv:
        try:
            next(file_handler)
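        except StopIteration:
            # Hypothetical continuation (assumption): an empty .csv file has no
            # rows to decode, so the generator simply stops
            return

    # Hypothetical sketch of the decoding loop (assumptions: .csv values appear
    # in the same order as es_fields_keys; .jsonl lines already carry those keys):
    #
    #     for i, line in enumerate(file_handler):
    #         if is_csv:
    #             data = dict(zip(es_fields_keys, line.rstrip("\n").split(",")))
    #         else:
    #             data = json.loads(line)
    #         yield str(i), {key: data.get(key) for key in es_fields_keys}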