Example No. 1
    def __init__(self, configFile, mode='normal'):
        ct = ConfigTyped(configFile, mode)
        opts = ct.opts

        thisDir = os.path.dirname(os.path.abspath(__file__))
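        # Gather the raw Pushshift dump files, newest first, sorted by the filename
        # after its 3-character prefix (e.g. RC_/RS_)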
        idir = opts['pushshift_raw_dir']
        onlyfiles = sorted(
            [join(idir, f) for f in listdir(idir) if isfile(join(idir, f))],
            reverse=True,
            key=lambda x: os.path.basename(x)[3:])
        log('1-pushshift-slim')
        for input_file in onlyfiles:
            extension = os.path.splitext(input_file)[1].lower()
            basename = os.path.basename(re.sub(r'\.[^.]*$', '', input_file))
            upl = FilesLog(opts['unprocessable_files_log'])
            sl = FilesLog(opts['skippable_files_log'])
            if basename in (upl.read_entries() + sl.read_entries()):
                continue
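            # RC_* files hold comments; RS_* files hold posts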
            type = 'comments'
            if basename[1].lower() == 's':
                type = 'posts'
            if extension in ['.gz', '.xz', '.zst', '.bz2']:
                output_file = join(opts['pushshift_slim_dir'],
                                   basename + '.csv')
                if (not isfile(output_file)) or opts['force']:
                    log(basename)
                    pfp = PushshiftFileProcessor(input_file, output_file, type,
                                                 opts['max_lines_to_read'])
                    try:
                        pfp.transform()
                    except BadSubmissionData:
                        upl.add_entry(basename)
                        log(basename + ' marked as unprocessable')
                        continue
        log('finished')
Example No. 2
    def __init__(self, configFile, mode='normal'):
        ct = ConfigTyped(configFile, mode)
        opts = ct.opts
        thisDir = os.path.dirname(os.path.abspath(__file__))
        idir = opts['aggregate_all_dir']
        types = {'RC_': 'comments', 'RS_': 'posts'}
        ## Order matters, must complete 'RC_' first
        for prefix in ['RC_', 'RS_']:
            type = types[prefix]
            file = join(idir, prefix + 'aggregate_all.csv')
            log('4-add-fields ' + type)
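            # extra_fields_<type> is a comma-separated option; strip whitespace around each field name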
            af = AddFields(
                file, opts['add_fields_dir'], type,
                opts['add_fields_id_field'],
                list(
                    map(lambda x: x.strip(),
                        opts['extra_fields_' + type].split(','))))
            af.process()
        log('finished')
Example No. 3
    def __init__(self, configFile, mode='normal'):
        ct = ConfigTyped(configFile, mode)
        opts = ct.opts

        thisDir = os.path.dirname(os.path.abspath(__file__))
        idir = opts['pushshift_slim_dir']
        onlyfiles = sorted(
            [join(idir, f) for f in listdir(idir) if isfile(join(idir, f))],
            reverse=True,
            key=lambda x: os.path.basename(x)[3:])
        log('2-aggregate-monthly')
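        # Aggregate each per-month slim CSV; skip files whose output already exists unless 'force' is set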
        for input_file in onlyfiles:
            extension = os.path.splitext(input_file)[1].lower()
            basename = os.path.basename(re.sub(r'\.[^.]*$', '', input_file))
            if extension == '.csv':
                output_file = join(opts['aggregate_dir'], basename + '.csv')
                if (not isfile(output_file)) or opts['force']:
                    log(basename)
                    am = AggregateMonthly(input_file, output_file, opts['aggregate_n_rows'], opts['dropna'])
                    am.aggregate()
                    am.write_csv()
        log('finished')
Example No. 4
    def __init__(self, configFile, mode='normal'):
        ct = ConfigTyped(configFile, mode)
        opts = ct.opts
        thisDir = os.path.dirname(os.path.abspath(__file__))
        idir = opts['aggregate_dir']
        for prefix in ['RC_', 'RS_']:

            output_file = join(opts['aggregate_all_dir'],
                               prefix + 'aggregate_all.csv')
            files = list(filter(isfile, glob.glob(join(idir, f"{prefix}*.csv"))))
            files.sort(key=lambda x: os.path.getmtime(x), reverse=True)
            mostRecentInputFileModificationTime = os.path.getmtime(files[0])
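            # Skip this prefix when the existing combined output is newer than the newest input (unless 'force' is set)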
            if not (opts['force']) and isfile(
                    output_file) and os.path.getmtime(
                        output_file) > mostRecentInputFileModificationTime:
                log('skipping ' + prefix)
                continue
            log('3-aggregate-all ' + prefix)
            aa = AggregateAll(idir, opts['aggregate_all_min_rows'],
                              opts['aggregate_n_rows'])
            aa.process(prefix)
            aa.write_csv(output_file)
        log('finished')
Example No. 5
    def __init__(self, configFile, dbConfigFile, mode='normal'):
        config = ConfigTyped(configFile, mode)
        opts = config.opts
        dbconfig = ConfigTyped(dbConfigFile, mode)
        dbopts = dbconfig.opts
        self.opts = opts
        self.dbopts = dbopts
        thisDir = os.path.dirname(os.path.abspath(__file__))
        aa_dir = opts['aggregate_all_dir']
        af_dir = opts['add_fields_dir']
        table_files = {
            'aggregate_comments': join(aa_dir, 'RC_aggregate_all.csv'),
            'aggregate_posts': join(aa_dir, 'RS_aggregate_all.csv'),
            'comments': join(af_dir, 'comments.csv'),
            'posts': join(af_dir, 'posts.csv'),
        }
        missing_files = [f for f in table_files.values() if not isfile(f)]
        if missing_files:
            log('ERROR: missing files: ' + str(missing_files))
            sys.exit(1)
        log('5-load-db')

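        # Connect to a separate, existing database so the target database can be created if it does not exist yet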
        other_engine = create_engine(dbconfig.get_connectString(
            dbopts['other_db_name']),
                                     pool_pre_ping=True)
        with other_engine.connect() as con:
            con.execute('COMMIT;')
            result = con.execute(
                f"SELECT 1 FROM pg_database WHERE datname='{dbopts['db_name']}'"
            )
            if (not result.fetchone()):
                con.execute(f"CREATE DATABASE {dbopts['db_name']};")
        engine = create_engine(dbconfig.get_connectString(dbopts['db_name']),
                               pool_pre_ping=True)
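        # Recreate the temporary schema, dropping any leftover version from a previous run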
        if not engine.dialect.has_schema(engine, dbopts['db_schema_tmp']):
            engine.execute(CreateSchema(dbopts['db_schema_tmp']))
        else:
            with engine.connect() as con:
                con.execute(f"""
                        DROP SCHEMA {dbopts['db_schema_tmp']} CASCADE;
                    """)
            engine.execute(CreateSchema(dbopts['db_schema_tmp']))
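        # Create each table in the tmp schema (column definitions come from the 'columns'
        # mapping defined elsewhere in the source) and bulk-load its CSV via COPY, skipping the header row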
        for table, input_file in table_files.items():
            log(table)
            conn = psycopg2.connect(
                dbconfig.get_connectString_psy(dbopts['db_name']))
            cur = conn.cursor()
            cur.execute(f"""
                    CREATE TABLE {dbopts['db_schema_tmp']}.{table} ({columns[table]});
                """)
            conn.commit()
            with open(input_file, 'r') as f:
                next(f)  # Skip the header row
                cur.copy_expert(
                    f"""
                        COPY {dbopts['db_schema_tmp']}.{table} FROM STDIN WITH CSV
                    """, f)
                conn.commit()
            cur.close()
            conn.close()
        log('5-load-db finished.')
Example No. 6
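    # Print a single config option's value with no trailing newline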
    def __init__(self, configFile, variableName, mode='normal'):
        config = ConfigTyped(configFile, mode)
        print(config.opts[variableName], end='')
Example No. 7
    def __init__(self, dbConfigFile, mode='normal'):
        dbconfig = ConfigTyped(dbConfigFile, mode)
        dbopts = dbconfig.opts

        engine = create_engine(dbconfig.get_connectString(dbopts['db_name']),
                               pool_pre_ping=True)
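        # If the tmp schema exists, promote its tables: drop the current public tables,
        # move the tmp tables into public, then drop the tmp schema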
        if engine.dialect.has_schema(engine, dbopts['db_schema_tmp']):
            with engine.connect() as con:
                con.execute('COMMIT;')
                con.execute(f"""
                        DO $$ DECLARE
                            r RECORD;
                        BEGIN
                            -- if the schema you operate on is not "current", you will want to
                            -- replace current_schema() in query with 'schematodeletetablesfrom'
                            -- *and* update the generated 'DROP...' accordingly.
                            FOR r IN (SELECT tablename FROM pg_tables WHERE schemaname = current_schema()) LOOP
                                EXECUTE 'DROP TABLE IF EXISTS ' || quote_ident(r.tablename) || ' CASCADE';
                            END LOOP;
                        END $$;
                    """)
                con.execute(f"""
                        DO $$ DECLARE
                            r RECORD;
                        BEGIN
                            -- if the schema you operate on is not "current", you will want to
                            -- replace current_schema() in query with 'schematodeletetablesfrom'
                            -- *and* update the generated 'ALTER...' accordingly.
                            FOR r IN (SELECT tablename FROM pg_tables WHERE schemaname = '{dbopts['db_schema_tmp']}') LOOP
                                EXECUTE 'ALTER TABLE {dbopts['db_schema_tmp']}.' || quote_ident(r.tablename) || ' SET SCHEMA public';
                            END LOOP;
                        END $$;
                    """)
                con.execute(f"""
                        DROP SCHEMA {dbopts['db_schema_tmp']} CASCADE;
                    """)
        engine = create_engine(dbconfig.get_connectString(dbopts['db_name']),
                               pool_pre_ping=True)
        with engine.connect() as con:
            log('6-create-db-functions indices start')
            ## Not using these two foreign keys b/c aggregate_ tables may not have
            ## corresponding comments/posts entries for data that couldn't
            ## be downloaded from pushshift via the AddFields process.
            ## As a result, when pulling data, aggregate_ tables have some scores that don't appear
            ## in comments/posts, so scores should be pulled from aggregate_ table
            ## So, the score field copied to comments/posts is not used now, and left as-is in case needed later

            #ALTER TABLE aggregate_comments ADD CONSTRAINT fk_mpri_id_c FOREIGN KEY (id_of_max_pos_removed_item) REFERENCES comments (id);
            #ALTER TABLE aggregate_posts ADD CONSTRAINT fk_mpri_id_p FOREIGN KEY (id_of_max_pos_removed_item) REFERENCES posts (id);

            ## Similarly, b/c PS queries fail to download some posts data, this constraint isn't possible to include,

            # 'ALTER TABLE comments ADD CONSTRAINT fk_post_id FOREIGN KEY (link_id) REFERENCES posts (id);'
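            # 'indexes' is a list of index-creation statements defined elsewhere in the source;
            # apply each one, ignoring failures (e.g. an index that already exists)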
            for index in indexes:
                try:
                    con.execute(index)
                    log('Successful: ' + index)
                except Exception:
                    ## e.g. the index already exists; skip and continue
                    pass
                    #log('WARNING: Index creation failed (it may already exist)')
                    #log('         '+index)
            log('6-create-db-functions functions start')
            con.execute(f"""
DROP FUNCTION IF EXISTS getCommentUpvoteRemovedRatesByRate;
DROP FUNCTION IF EXISTS getCommentUpvoteRemovedRatesByDate;

DROP TABLE IF EXISTS commentColumnsReturnTypes;
CREATE TABLE commentColumnsReturnTypes ( {commentColumnsReturnTypes} );

CREATE OR REPLACE function
  getCommentUpvoteRemovedRatesByRate(subreddit VARCHAR(30), num_records integer) RETURNS SETOF commentColumnsReturnTypes
AS
$$
BEGIN
  RETURN query execute 'SELECT {commentColumnsSelectSQL}
    {commentFromJoinWhereSQL}
        ORDER BY rate DESC LIMIT $2)
    ORDER BY rate DESC'
  USING subreddit, num_records;
END;
$$ language plpgsql STABLE;

CREATE OR REPLACE function
  getCommentUpvoteRemovedRatesByDate(subreddit VARCHAR(30), num_records integer) RETURNS SETOF commentColumnsReturnTypes
AS
$$
BEGIN
  RETURN query execute 'SELECT {commentColumnsSelectSQL}
    {commentFromJoinWhereSQL}
        ORDER BY last_created_utc DESC LIMIT $2)
    ORDER BY last_created_utc DESC'
  USING subreddit, num_records;
END;
$$ language plpgsql STABLE;

DROP FUNCTION IF EXISTS getPostUpvoteRemovedRatesByRate;
DROP FUNCTION IF EXISTS getPostUpvoteRemovedRatesByDate;

DROP TABLE IF EXISTS postColumnsReturnTypes;
CREATE TABLE postColumnsReturnTypes ( {postColumnsReturnTypes} );

CREATE OR REPLACE function
  getPostUpvoteRemovedRatesByRate(subreddit VARCHAR(30), num_records integer) RETURNS SETOF postColumnsReturnTypes
AS
$$
BEGIN
  RETURN query execute 'SELECT {postColumnsSelectSQL}
    {postFromJoinWhereSQL}
        ORDER BY rate DESC LIMIT $2)
    ORDER BY rate DESC'
  USING subreddit, num_records;
END;
$$ language plpgsql STABLE;

CREATE OR REPLACE function
  getPostUpvoteRemovedRatesByDate(subreddit VARCHAR(30), num_records integer) RETURNS SETOF postColumnsReturnTypes
AS
$$
BEGIN
  RETURN query execute 'SELECT {postColumnsSelectSQL}
    {postFromJoinWhereSQL}
        ORDER BY last_created_utc DESC LIMIT $2)
    ORDER BY last_created_utc DESC'
  USING subreddit, num_records;
END;
$$ language plpgsql STABLE;

UPDATE pg_language SET lanvalidator = 2247 WHERE lanname = 'c';

""".replace('%', '%%'))
        log('finished')