def download_formadvs():
    """Download every numbered Form ADV zip archive linked from the SEC FOIA page."""
    browser = HomeBrowser(starturl = r'https://www.sec.gov/help/foiadocsinvafoiahtm.html')
    # Archive links look like NNNNNN.zip and are relative to sec.gov.
    for tag in browser.filterlinks(r'\d{6}\.zip'):
        full_url = "https://www.sec.gov/%s" % tag.url
        zipname = OSPath.split(full_url)[-1]
        browser.download(full_url, outfile = mkpath(zipfolder, zipname))
def read_dailyxml():
    """Append the gzipped daily IAPD feed to daily.xml and parse the result.

    Returns a DataFrame built from the 'Firm'/'FormInfo' records parsed
    out of the XML.
    """
    # Stream the feed in chunks so the whole payload is never held in
    # memory; `with` guarantees the handle is closed even if chunker or
    # appendData raises (the original leaked the handle on error).
    with gzip.open(get_dailyxml_path(), 'rb') as f:
        for chunk in chunker(f, chunksize = 100):
            appendData(mkpath(xmlfolder, 'daily.xml'), ''.join(chunk))

    # NOTE(review): chunks are appended to 'daily.xml' above, but
    # 'dailyxml.xml' is parsed here -- looks like a filename mismatch;
    # confirm which file parsexml is meant to read.
    return pd.DataFrame(parsexml('dailyxml.xml', 'Firm', 'FormInfo'))
# Example #3
def load_scheduleDs():
    """Load Schedule D data from predictiveops.json in each numeric data folder.

    Folders whose JSON cannot be read are skipped (best effort: an IOError
    on one folder must not stop the batch).
    """
    # listdir already yields the matching folders; wrapping it in a
    # comprehension was redundant (flake8-comprehensions C416).
    folders = list(Folder.listdir('data', pattern=r'\d+$'))

    for folder in folders:
        try:
            data = from_json(mkpath(folder, 'predictiveops.json'))
            load_scheduleD(data)
        except IOError:
            # Missing/unreadable file: skip this folder, keep going.
            continue
# Example #4
 def normfile(self, formadv, dailyxml=False, **kwds):
     """Read one Form ADV file, normalize it, and write the preprocessed CSV.

     formadv  : record exposing .filename (input) and .outfile (output name).
     dailyxml : accepted but unused in this body -- TODO confirm intended use.
     kwds     : forwarded to normdf.
     Returns the normalized DataFrame.
     """
     df = read_formadv(formadv.filename)
     df = self.normdf(df, formadv, **kwds)
     self.writefile(df, mkpath(preprocessed, formadv.outfile))
     return df
# Example #5
class FormadvStage(Stage):
    """Pipeline stage that normalizes raw Form ADV filings.

    For each ``db.FormADV`` record the stage reads the raw file, cleans
    numeric and descriptive fields, and writes a preprocessed CSV.
    """

    # Field-mapping configuration consumed by the base Stage machinery.
    FIELDSPATH = mkpath('config', 'fieldsconfig.json')

    def __init__(self):
        super(FormadvStage, self).__init__('formadv')

    @classmethod
    def processfiles(cls, start=1, **kwds):
        """Normalize every FormADV record whose id is >= ``start``.

        start : first record id to process (lets an interrupted run resume).
        kwds  : accepted for interface compatibility (not forwarded here).
        """
        advprsr = cls()
        advprsr.info("Starting at entry number {}".format(start))
        for formadv in db.FormADV.select():
            if formadv.id >= start:
                advprsr.info("Currently processing '{}'".format(
                    formadv.filename))
                advprsr.normfile(formadv)

    @staticmethod
    def get_number(df, field='numberofclients'):
        """Return a ranked numeric series for ``field``.

        Answers matching re_NUMBERSPECIFY (i.e. "specify below" style
        placeholders) are blanked and backfilled from the companion
        '<field>_specify' column, then mapped through ``numericrank``.
        """
        data = df[field].copy()
        mask = data.notnull()
        # Blank the placeholder answers so the *_specify values win below.
        data.loc[data.contains(re_NUMBERSPECIFY)] = np.nan
        __ = df.loc[mask, '{}_specify'.format(field)]
        return data.modify(mask, data.fillna(__)).quickmap(numericrank)

    @staticmethod
    def cleantext(text, key):
        """Turn a '<key>_some_label' column name into 'Some Label'.

        Text that does not start with ``key`` is only whitespace-normalized.
        """
        if text.startswith(key):
            return ' '.join(
                i.capitalize()
                for i in text.replace("{}_".format(key), '').split('_'))
        return to_single_space(text)

    @staticmethod
    def get_types(df):
        """Extract per-adviser type/percentage descriptions from ``df``.

        Scans the client_types / compensation / pct_aum / disclosures
        column families and returns a dict with one cleaned DataFrame per
        family plus a de-duplicated 'descriptions' list of dicts.
        """
        categories = ['client_types', 'compensation', 'pct_aum', 'disclosures']
        fields = ['adviser', 'text', 'specific', 'percentage']
        typesmap = {'descriptions': []}
        for key in categories:
            # Long format: level_0 = source row, level_1 = column name,
            # column 0 = the reported value.
            data = df.filter(regex=key).stack().reset_index()
            if data.empty:
                continue

            # Build mask/mapper pairs for the '*_specify' and '*_other'
            # companion columns of this category.
            __maps = {}
            for _key in (
                    'specify',
                    'other',
            ):
                __ = data.level_1.contains("{}_(?:other_)?{}$".format(
                    key, _key))
                __maps.update({
                    _key: {
                        'map': data.loc[__].get_mapper('level_0', 0),
                        'mask': __
                    }
                })

            # NOTE(review): the 'other' map was computed but never used
            # (the unused ``map_o`` local was removed).
            mask_o = __maps['other']['mask']
            map_s = __maps['specify']['map']

            # 'other' rows take their free-text description from the
            # matching '*_specify' answer; all others keep a cleaned-up
            # version of their column label.
            descriptions = data.assign(
                text=data.level_1.modify(mask_o,
                                         data.level_0.map(map_s)).quickmap(
                                             FormadvStage.cleantext, key),  #
                specific=data.level_1.modify(mask_o, True, elsevalue=False),  #
                adviser=data.level_0.map(df.crd.to_dict()),  #
                percentage=data[0].quickmap(percentrank)  #
                # .loc replaces the removed DataFrame.ix indexer; selection
                # by a list of labels is identical.
            ).loc[:, fields]

            qty = descriptions.percentage.to_numeric(force=True)
            if key == 'disclosures':
                # Disclosures are counts, not percentages.
                descriptions['number'] = qty
            else:
                descriptions['percentage'] = qty

            # Drop placeholder rows and zero/blank quantities.
            dropmask = (descriptions.text !=
                        'Other Specify') & (qty != 0) & (qty.notnull())
            descriptions = descriptions.loc[dropmask].dropna()
            typesmap['descriptions']\
                .extend(descriptions\
                    .loc[:, ['text', 'specific']]\
                    .drop_duplicates(subset = ['text'])\
                    .to_dict(orient = 'records'))

            # ``descriptions`` is already filtered by dropmask and dropna'd
            # above; re-applying both here was redundant (and the stale
            # full-index dropmask is unalignable in modern pandas).
            typesmap.update({
                key : descriptions.rename(columns = {'text' : 'description'})
                    })

        # De-duplicate the accumulated description dicts by their items.
        typesmap['descriptions'] = [
            dict(t)
            for t in {tuple(d.items())
                      for d in typesmap['descriptions']}
        ]

        return typesmap

    @staticmethod
    def addnames(df):
        """Split ``contactperson`` into name-part columns when it exists."""
        if hasattr(df, 'contactperson'):
            df = pd.concat([df, df.contactperson.to_name()], axis=1)
        return df

    def normdf(self, df, formadv, **kwds):
        """Normalize a raw Form ADV frame and attach filing metadata."""
        df = super(FormadvStage, self).normdf(df, **kwds)
        nflds = self.numeric_fields
        num = df[nflds].copy()
        # Advisers that left numeric questions blank did not provide a
        # value; treat those as zero.  (The original tested an undefined
        # ``_num`` here, which raised NameError at runtime.)
        if num.isnull().any(axis=1).any():
            df[nflds] = num.fillna(0)

        return df.assign(
            formadv=formadv.id,
            adviser=df.crd,
            numberofclients=self.get_number(df),
            numberofemployees=self.get_number(df, field='numberofemployees'),
            date=formadv.date,
        ).clean_addresses().addnames()

    def writefile(self, df, outfile, **kwds):
        """Write ``df`` to ``outfile``, retrying with utf-8 on encode errors."""
        while True:
            try:
                df.to_csv(outfile, index=False, **kwds)
                break
            except UnicodeEncodeError as e:
                self.error("Encoding troubles")
                self.error(e)
                # Retry the write with an explicit encoding; utf-8 can
                # represent any text, so the loop terminates.
                kwds['encoding'] = 'utf-8'

    def normfile(self, formadv, dailyxml=False, **kwds):
        """Read, normalize, and persist one Form ADV filing; return the frame.

        formadv  : record exposing .filename (input) and .outfile (output name).
        dailyxml : accepted but unused in this body -- TODO confirm intended use.
        """
        df = read_formadv(formadv.filename)
        df = self.normdf(df, formadv, **kwds)
        self.writefile(df, mkpath(preprocessed, formadv.outfile))
        return df
def get_dailyxml_path():
    """Absolute path of today's gzipped IA firm SEC feed inside the xml folder."""
    feed_name = utcnow().strftime(r'IA_FIRM_SEC_Feed_%m_%d_%Y.xml.gz')
    return OSPath.abspath(mkpath(xmlfolder, feed_name))
 def get_outfile(date):
     """Return the preprocessed CSV path named after *date* (MMDDYY_output.csv)."""
     filename = date.strftime("%m%d%y_output.csv")
     return mkpath(preprocessed_folder, filename)