# Put into a dataframe; the first row of `contents` supplies the column
# names and the remaining rows are the data:
df = pd.DataFrame(data=contents[1:])
df.columns = contents[0]

# Reindex using the numeric part of `docid` (any trailing lowercase
# letters are stripped) and sort on it.
# This lets us gauge posting dates even if that's missing for a doc:
dfindex = [int(re.sub(r'[a-z]*$', '', doc)) for doc in df.docid]
df.index = dfindex
df.sort_index(inplace=True)
if file == '43-heller.csv':
    df = Config.fix_heller_dates(df)

# Subset to 2013-2016.
# NOTE(review): only the lower bound (2013-01-03) is enforced here; the
# upper bound is presumably guaranteed by the input data -- confirm.
df['form_date'] = pd.to_datetime(df['form_date'], yearfirst=True)
validdates = df['form_date'] >= pd.to_datetime('2013-01-03', yearfirst=True)
if validdates.any():
    # Largest index label whose date passes the cutoff. Selecting the
    # passing labels with the boolean mask (instead of `Index.where`, which
    # substitutes NaN for failing rows) keeps the result an integer label
    # and avoids the builtin `max` returning NaN whenever the first row
    # fails the date test.
    lastvalid = validdates.index[validdates.values].max()
    # Label-based slice: keeps every row from the start through
    # `lastvalid`, including rows inside that range that failed the
    # comparison themselves.
    df = df.loc[:lastvalid, :]
else:
    # No row passes the cutoff: nothing to keep.
    lastvalid = None
    df = df.iloc[0:0]

# Drop empty docs:
notempty = df['clean_text'] != ''
df = df.loc[notempty, :]
n_docs = len(df)