Beispiel #1
0
async def add_fields_from_incident_url(df, args, predicate=None):
    """Add the fields fetched from each row's incident URL as columns.

    One ``session.get_fields_from_incident_url`` call is created per selected
    row and all of them are awaited together. Rows whose call raised are
    flagged in the ``incident_url_fields_missing`` column and receive
    ``NIL_FIELDS`` in place of real values.

    Args:
        df: DataFrame to augment.
        args: Object whose ``conn_limit`` attribute is passed to
            ``Stage2Session`` as ``limit_per_host``.
        predicate: Optional row selector handed to ``df.loc``. When None,
            every row of ``df`` is processed.

    Returns:
        ``df``. When ``predicate`` is given, the processed rows are written
        back into ``df`` and the rows whose request failed with a 404
        ``ClientResponseError`` are dropped from it in place.
    """
    log_first_call()

    def shared_name(column):
        # Every Field in one column must carry the same name; anything else
        # means the per-row tuples do not line up.
        assert len({field.name for field in column}) == 1
        return column[0].name

    subset = df if predicate is None else df.loc[predicate]
    if len(subset) == 0:
        # No work to do
        return df

    async with Stage2Session(limit_per_host=args.conn_limit) as session:
        # One coroutine per row, each producing a tuple of Fields.
        tasks = subset.apply(session.get_fields_from_incident_url, axis=1)
        # return_exceptions=True: a failed row yields its exception object
        # instead of aborting the whole batch.
        results = await asyncio.gather(*tasks, return_exceptions=True)

    # Suppress Pandas' SettingWithCopyWarning only for the assignments below.
    # option_context restores whatever value the caller had configured,
    # instead of unconditionally resetting the option to 'warn'.
    with pd.option_context('mode.chained_assignment', None):
        failed = [isinstance(result, Exception) for result in results]
        subset['incident_url_fields_missing'] = failed

        not_found = [
            isinstance(result, ClientResponseError) and result.code == 404
            for result in results
        ]

        # Substitute the placeholder tuple for every failed row so that all
        # rows have the same shape.
        rows = [
            NIL_FIELDS if did_fail else result
            for did_fail, result in zip(failed, results)
        ]

        # Transpose rows-of-Fields into columns-of-Fields, then split each
        # column into its (single) name and its list of values.
        named_columns = [
            (shared_name(column), [field.value for field in column])
            for column in zip(*rows)
        ]

        for name, values in named_columns:
            assert subset.shape[0] == len(values)
            subset[name] = values

        subset = subset.astype(SCHEMA)

    # NOTE(review): when predicate is None, `subset` starts out as `df`
    # itself, so the new columns are added to `df` in place, but the
    # astype(SCHEMA) result and the 404 drop below are not applied to the
    # returned frame. Confirm whether that is intended.
    if predicate is not None:
        df.loc[subset.index] = subset
        df.drop(index=subset.index[not_found], inplace=True)

    return df
Beispiel #2
0
def add_incident_id(df):
    """Insert an integer ``incident_id`` column as the first column of ``df``.

    The id is the part of each ``incident_url`` value that follows the fixed
    incident URL prefix. ``df`` is modified in place and also returned.
    """
    log_first_call()

    url_prefix = 'http://www.gunviolencearchive.org/incident/'

    def id_from_url(url):
        # Every URL is expected to start with the known prefix.
        assert url.startswith(url_prefix)
        suffix = url[len(url_prefix):]
        return int(suffix)

    incident_ids = df['incident_url'].apply(id_from_url)
    df.insert(0, 'incident_id', incident_ids)
    return df
 def get_fields_from_incident_url(self, row, driver):
     log_first_call()
     try:
         return self._get_fields_from_incident_url(row, driver)
     except Exception as exc:
         if isinstance(exc, IpBlocked):
             raise
         pass
         # Passing return_exceptions=True to asyncio.gather() destroys the ability
         # to print them once they're caught, so do that manually here.
         '''if isinstance(exc, ClientResponseError) and exc.code == 404:
Beispiel #4
0
 async def get_fields_from_incident_url(self, row):
     """Return the fields for ``row``, reporting failures before re-raising.

     Any exception from ``_get_fields_from_incident_url`` is re-raised to the
     caller. Everything except a 404 ``ClientResponseError`` is additionally
     logged and has its traceback printed first.
     """
     log_first_call()
     try:
         fields = await self._get_fields_from_incident_url(row)
     except Exception as exc:
         # Callers that collect results with
         # asyncio.gather(return_exceptions=True) lose the chance to print
         # the traceback later, so it is printed here at the point of failure.
         is_not_found = isinstance(exc, ClientResponseError) and exc.code == 404
         # A 404 is handled gracefully elsewhere, so it is not reported.
         if not is_not_found:
             self._log_extraction_failed(row['incident_url'])
             tb.print_exc()
         raise
     return fields
Beispiel #5
0
    def extract_fields(self, text, ctx):
        """Parse an incident page and return its fields, passed through ``_normalize``.

        Args:
            text: HTML of the page, parsed with the ``html5lib`` parser.
            ctx: Extra context forwarded to the location extractor only.
        """
        log_first_call()
        soup = BeautifulSoup(text, features='html5lib')

        # Run every extractor up front, in a fixed order.
        location = self._extract_location_fields(soup, ctx)
        participants = self._extract_participant_fields(soup)
        characteristics = self._extract_incident_characteristics(soup)
        notes = self._extract_notes(soup)
        guns_involved = self._extract_guns_involved_fields(soup)
        sources = self._extract_sources(soup)
        districts = self._extract_district_fields(soup)

        # Assemble the flat field list: multi-field groups are spliced in,
        # single values are wrapped in a Field of their own.
        fields = list(location)
        fields.extend(participants)
        fields.append(Field('incident_characteristics', characteristics))
        fields.append(Field('notes', notes))
        fields.extend(guns_involved)
        fields.append(Field('sources', sources))
        fields.extend(districts)
        return _normalize(fields)
Beispiel #6
0
async def add_fields_from_incident_url(driver, df, args, predicate=None):
    """Build a DataFrame of the selected rows plus fields fetched via ``driver``.

    Rows are processed one at a time, in order. For each row the session's
    ``get_fields_from_incident_url(row, driver)`` is called and every
    ``(name, value)`` pair it returns is appended to the row's values.
    Processing stops at the first exception, which is printed.

    Side effects on module-level state: ``columns`` is rebound to the column
    list of the result, the first value of every processed row is appended to
    ``incident_ids``, and the row values are stored in ``data_dict`` keyed by
    row position.

    Args:
        driver: Passed through unchanged to the session method.
        df: Source DataFrame.
        args: Object whose ``conn_limit`` attribute is passed to
            ``Stage2Session`` as ``limit_per_host``.
        predicate: Optional row selector handed to ``df.loc``. When None,
            every row of ``df`` is processed.

    Returns:
        ``df`` itself when no rows are selected; otherwise a new DataFrame
        built from ``data_dict``.
    """
    log_first_call()

    global columns
    global incident_ids

    subset = df if predicate is None else df.loc[predicate]
    if len(subset) == 0:
        # No work to do
        return df

    async with Stage2Session(limit_per_host=args.conn_limit) as session:
        columns = subset.columns.tolist()
        # Extra column names are taken from the first row that actually
        # yields fields. Tying this to row 0 would leave `columns` too short
        # whenever row 0 produced nothing but a later row did.
        extra_columns_added = False
        for i in range(len(subset)):
            row = subset.iloc[i]
            values = row.tolist()
            try:
                extra_fields = session.get_fields_from_incident_url(
                    row, driver)
                if extra_fields:
                    names = []
                    for name, value in extra_fields:
                        names.append(name)
                        values.append(value)
                    if names and not extra_columns_added:
                        columns.extend(names)
                        extra_columns_added = True
                data_dict[i] = values
                incident_ids.append(values[0])
            except Exception as exc:
                # Per the original author's note, the session call is
                # expected to raise only IpBlocked; stop and keep whatever
                # was collected so far.
                print(str(exc))
                break

        df = pd.DataFrame.from_dict(data_dict, orient='index', columns=columns)
    return df
Beispiel #7
0
def load_input(args):
    """Read the CSV file named by ``args.input_fname`` into a DataFrame.

    Columns are typed according to ``SCHEMA``, the ``date`` column is parsed
    as dates, and the file is read as UTF-8.
    """
    log_first_call()
    read_options = {
        'dtype': SCHEMA,
        'parse_dates': ['date'],
        'encoding': 'utf-8',
    }
    return pd.read_csv(args.input_fname, **read_options)
def _compute_wait(average_wait, rng_base):
    """Return a randomized integer wait derived from ``average_wait``.

    The result is ``average_wait * rng_base ** z`` rounded up to an integer,
    where ``z`` is one sample from the standard normal distribution.
    """
    log_first_call()
    # Work in log space (base rng_base) so the random term scales the wait
    # multiplicatively rather than additively.
    base_exponent = math.log(average_wait, rng_base)
    jitter = np.random.standard_normal(size=1)[0]
    scaled = rng_base**(base_exponent + jitter)
    return int(np.ceil(scaled))