Ejemplo n.º 1
0
 def headlerlistcreation(headeritem):
     """Interleave header labels with their values from a flat token list.

     Tokens equal to ``'Management'`` or ``'Opposition'`` are dropped first.
     The remaining tokens are read as alternating label/value pairs: even
     positions are labels, odd positions are values.  When an odd-position
     token contains ``"Date"`` or ``"Agenda"`` it is taken to be the next
     label, so a ``" "`` placeholder is inserted in front of it and a
     two-space string is recorded as the missing value.

     Args:
         headeritem: Iterable of header tokens.  The caller's object is not
             modified; the function works on a filtered copy.

     Returns:
         A flat list ``[label0, value0, label1, value1, ...]``.  A trailing
         label without a value is dropped because the pairs are built with
         ``zip``.
     """
     logger = Utils.add_logger()
     items = [
         x for x in headeritem if x not in ['Management', 'Opposition']
     ]
     headerlist = []
     colist = []
     i = 0
     try:
         # len(items) is re-read on every pass because inserting a
         # placeholder grows the list; a precomputed range() would stop
         # early and drop trailing tokens after more than one insertion.
         while i < len(items):
             if i % 2 == 0:
                 headerlist.append(items[i])
             elif "Date" in items[i] or "Agenda" in items[i]:
                 items.insert(i, " ")
                 colist.append("  ")
             else:
                 colist.append(items[i])
             i += 1
     except TypeError:
         # A token that does not support ``in`` (e.g. None) ends the scan;
         # whatever was collected so far is still returned.
         logger.debug("colList:{}".format(colist))
     alist = []
     for label, value in zip(headerlist, colist):
         alist.append(label)
         alist.append(value)
     return alist
    def preprocess(accession_no):
        """Read a ``.dissem`` filing and split its text into sections.

        The file ``<file_path>/<accession_no>.dissem`` is parsed with
        BeautifulSoup.  The registrant name is taken from the text after the
        first ``:`` on the line containing ``"NAME OF REGISTRANT"`` (the last
        such line of the last matching text node).  The document text is then
        split on runs of 20 or more ``-`` characters.

        Args:
            accession_no: Accession number used as the file stem.

        Returns:
            ``(result_list, Registrant_Name)`` on success, or ``([], "")``
            when the file cannot be read or the registrant line is missing
            or malformed (the error is logged).
        """
        logger = Utils.add_logger()
        try:
            path = os.path.join(CONFIG["Path"]["file_path"],
                                accession_no + ".dissem")
            # Context manager so the handle is closed even if parsing fails.
            with open(path, 'r', encoding='utf-8') as file:
                raw = bs(file.read(), "lxml")
            matches = raw.find_all(text=re.compile("NAME OF REGISTRANT"))
            # An empty match list raises IndexError here and falls into the
            # generic handler below, yielding the empty result.
            lines = matches[-1].split('\n')
            labelled = [
                line for line in lines if "NAME OF REGISTRANT" in line
            ]
            Registrant_Name = labelled[-1].split(':')[1].strip()
            logging.debug("Registrant Name for file:{} is {}".format(
                accession_no, Registrant_Name))
            # The registrant name is deliberately not compiled as a regex:
            # names containing metacharacters such as "(" would raise
            # re.error and discard an otherwise valid document.
            result_list = re.split(r"-{20,}", raw.getText())
        except Exception as error:
            logger.error("Error in preprocess for file-{}-{}".format(
                accession_no, error))
            return list(), ""

        return result_list, Registrant_Name
Ejemplo n.º 3
0
 def splitevenodd(headeritem, companyName):
     """Build a one-row DataFrame from alternating label/value tokens.

     Tokens equal to ``'Management'`` or ``'Opposition'`` are dropped first.
     Even positions become column labels and odd positions become the row's
     values.  When an odd-position token contains ``"Date"`` or ``"Agenda"``
     it is taken to be the next label: a ``" "`` placeholder is inserted in
     front of it and a two-space string is stored as the missing value.

     Args:
         headeritem: Iterable of header tokens (not modified).
         companyName: Value for an extra ``CompanyName`` column; the column
             is only added when ``len(companyName) >= 1``.

     Returns:
         A single-row ``pandas.DataFrame`` whose columns are the labels.

     Raises:
         ValueError: From ``pandas.Series`` when the number of labels and
             values differ (e.g. an odd number of tokens).
     """
     logger = Utils.add_logger()
     items = [
         x for x in headeritem if x not in ['Management', 'Opposition']
     ]
     headerlist = []
     colist = []
     i = 0
     try:
         # len(items) is re-read on every pass because inserting a
         # placeholder grows the list; a precomputed range() would stop
         # early and drop trailing tokens after more than one insertion.
         while i < len(items):
             if i % 2 == 0:
                 headerlist.append(items[i])
             elif "Date" in items[i] or "Agenda" in items[i]:
                 items.insert(i, " ")
                 colist.append("  ")
             else:
                 colist.append(items[i])
             i += 1
     except TypeError:
         # A token that does not support ``in`` (e.g. None) ends the scan.
         logger.debug("colList:{}".format(colist))
     # DataFrame.append was removed in pandas 2.0; a transposed Series gives
     # the same single row (labelled 0) on every pandas version.
     row = pd.Series(colist, index=headerlist)
     dfs = row.to_frame().T
     if len(companyName) >= 1:
         dfs['CompanyName'] = companyName
     return dfs
 def process(self, accession_no, doc_id):
     """Parse the tables of a ``.dissem`` filing and export them to Excel.

     Each ``<table>`` is checked with ``Format6APipeline.check_table``.  A
     table that fails the check is kept as markup and prepended to the next
     table that passes, and the combined markup is parsed instead.  Parsed
     frames are stacked into one DataFrame which is written to
     ``<output_path_format6B>/<accession_no>.xlsx``.

     Args:
         accession_no: Accession number used as the input/output file stem.
         doc_id: Not used by this method.

     Returns:
         ``1`` when the export step completes, ``0`` when reading, parsing
         the document or exporting raises.  Errors in individual tables are
         logged and that table is skipped.
     """
     logger = Utils.add_logger()
     try:
         # os.path.join keeps this working whether or not the configured
         # directory ends with a path separator.
         path = os.path.join(CONFIG["Path"]["file_path"],
                             accession_no + ".dissem")
         content = Utils.read_file(path, accession_no)
         content = Format6parser.segparsing(content)
         # Close rows that the source markup leaves unterminated.
         content = content.replace("</td>\n<tr>", "</td></tr><tr>")
         soup = BeautifulSoup(content, "html.parser")
         tables = soup.findAll('table')
         dffa = None
         prev_table = None
         finaldf = pd.DataFrame()
         for table in tables:
             try:
                 if Format6APipeline.check_table(table):
                     if prev_table is not None:
                         prev_table = prev_table + str(table.contents)
                         dffa = Format6parser.tabledetails(
                             BeautifulSoup(prev_table, "html.parser"))
                         prev_table = None
                     else:
                         dffa = Format6parser.tabledetails(table)
                 else:
                     prev_table = str(table.contents)
                 if dffa is not None:
                     if finaldf.empty:
                         finaldf = dffa
                     else:
                         # pd.concat replaces DataFrame.append, which was
                         # removed in pandas 2.0.
                         finaldf = pd.concat([finaldf, dffa], sort=True)
                     dffa = None
             except Exception as error:
                 logger.error(
                     "Exception in table parsing for format6 for accession_no -{} -{}"
                     .format(accession_no, error))
         filename = os.path.join(CONFIG['Path']['output_path_format6B'],
                                 accession_no + '.xlsx')
         Utils.df_to_excel(accession_no, finaldf, filename)
         return 1
     except Exception as error:
         logger.error(
             "Exception in format6 pipeline for accession_no -{}-{}".format(
                 accession_no, error))
         print("Exception in format6A pipeline-{}".format(error))
         return 0
Ejemplo n.º 5
0
 def process(self,accession_no,doc_id):
     """Parse a ``FUND:``-sectioned filing, store it and export it to Excel.

     The document text is split in front of every ``FUND:`` marker.  The
     text before the first marker is skipped; every later section is parsed
     with ``Format4Parser`` and the resulting frames are stacked.  Columns
     are then renamed, ``MeetingDate`` is passed through
     ``Utils.parsing_date``, and the accession number and document id are
     added before the frame is handed to ``Utils.df_to_database`` and
     ``Utils.df_to_excel``.

     Args:
         accession_no: Accession number used as the input/output file stem.
         doc_id: Document id stored in the ``DocumentId`` column.

     Returns:
         ``1`` on success, ``"unsuccessfull"`` when the file cannot be read
         or parsed into sections, or when stacking a section's frame fails.
         Errors raised after the sections are stacked are not caught here.
     """
     logger = Utils.add_logger()
     try:
         # os.path.join keeps this working whether or not the configured
         # directory ends with a path separator.
         path = os.path.join(CONFIG["Path"]["file_path"], accession_no + ".dissem")
         content = Utils.read_file(path,accession_no)
         content = content.replace('&#160;', '\xa0')
         soup = BeautifulSoup(content, "html.parser")
         contents = soup.text
         contents = contents.replace('SIGNATURES', '----------------------------------')
         contents = contents.replace('FUND:', '############### \n FUND:')
         fundlist = contents.split('###############')
     except Exception as error:
         logger.error("Exception in parsing through bs4 for accession_no -{} -{}".format(accession_no,error))
         return "unsuccessfull"
     final_df = pd.DataFrame()
     # Element 0 is whatever precedes the first "FUND:" marker, not a fund.
     for section in fundlist[1:]:
         sr = section.split('ISSUER')
         fName = Format4Parser.fundName(sr[0])
         df = Format4Parser.companyfundDetails(section, fName)
         try:
             # pd.concat replaces DataFrame.append (removed in pandas 2.0).
             final_df = pd.concat([final_df, df])
         except Exception as error:
             logger.error("Exception in pipeline for df merging for accession_no -{} -{}".format(accession_no,error))
             return "unsuccessfull"
     final_df = Format4Parser.concat_rowData(final_df)
     final_df = final_df.drop(columns=['ID'])
     final_df.rename(columns={"ISSUER":"CompanyName","MEETING DATE":"MeetingDate","Proposal No":"ProposalNumber",
                              "PROPOSAL":"Proposal","PROPOSED BY":"ProposedBy","VOTED?":"Voted",
                              "MGMT":"ForAgainstManagement","TICKER":"Ticker","VOTE CAST":"VoteCast"},inplace=True)
     final_df["MeetingDate"] = final_df["MeetingDate"].apply(Utils.parsing_date)
     final_df["AccesssionNumber"] = accession_no
     final_df["DocumentId"] = doc_id
     Utils.df_to_database(accession_no,doc_id,final_df)
     filename = os.path.join(CONFIG["Path"]["output_path_format4"], accession_no + ".xlsx")
     Utils.df_to_excel(accession_no,final_df,filename)
     return 1
    def process_df(accession_no, main_header_df, table_df):
        """Fold continuation rows into their proposal and join with headers.

        Rows of ``table_df`` without a ``Proposal`` value are dropped.  The
        remaining rows are walked from the last to the first; a run of rows
        whose ``ProposedBy`` is ``None`` (and which are not a ``"DIRECTOR"``
        row carrying a proposal number) has its ``Proposal`` text appended,
        top to bottom, to the row directly above the run, and the run's rows
        are then removed.  The result is inner-joined with
        ``main_header_df`` on ``seq_id``.

        Args:
            accession_no: Accession number, used only in log messages.
            main_header_df: Header rows; needs ``seq_id`` and ``MeetingDate``.
            table_df: Proposal rows; needs ``seq_id``, ``Proposal``,
                ``ProposedBy`` and ``ProposalNumber``.

        Returns:
            The merged DataFrame, or an empty DataFrame when any step raises
            (the error is logged and printed).
        """
        logger = Utils.add_logger()
        try:
            table_df = table_df.astype({"seq_id": int})
            main_header_df = main_header_df.reset_index(drop=True)
            main_header_df = main_header_df.astype({"seq_id": int})
            df = table_df.dropna(subset=['Proposal'])
            # presumably replace_empty maps blank cells to None so the
            # "== None" tests below can detect them -- confirm in Parser.
            df["ProposedBy"] = df["ProposedBy"].apply(Parser.replace_empty)
            df["ProposalNumber"] = df["ProposalNumber"].apply(
                Parser.replace_empty)
            main_header_df["MeetingDate"] = main_header_df[
                "MeetingDate"].apply(Utils.parsing_date)
            # Positional labels 0..n-1 are required by the df.at lookups.
            df = df.reset_index(drop=True)
            r = len(df.index) - 1
            #print(r)
            while r >= 0:
                # Collect one run of continuation rows, bottom-up.
                temp_index = list()
                # NOTE(review): this inner loop has no "r >= 0" bound; if
                # row 0 itself matches, the lookup at label -1 looks like it
                # would raise and send the whole call to the except branch.
                while ((df.at[r, "Proposal"] != "DIRECTOR"
                        or df.at[r, "ProposalNumber"] == None)
                       and (df.at[r, "ProposedBy"] == None)):
                    temp_index.append(r)
                    r -= 1
                #print(temp_index)
                # Step over the row that ended the run (the merge target).
                r -= 1
                if len(temp_index) >= 1:
                    # temp_index is descending, so its last entry is the
                    # topmost row of the run and c - 1 is the row above it.
                    c = temp_index[-1]
                    # print(c)
                    for m in reversed(temp_index):
                        df.at[c - 1, "Proposal"] += " " + df.at[m, "Proposal"]
                        # Blank the merged row so dropna below removes it.
                        df.at[m, "Proposal"] = None

            df = df.dropna(subset=["Proposal"])
            final_df = main_header_df.merge(df, on=["seq_id"], how="inner")
        except Exception as error:
            logger.error("Error in process_df for accession_no-{}-{}".format(
                accession_no, error))
            print("Error in df processing for accession_no-{}-{}".format(
                accession_no, error))
            return pd.DataFrame()

        return final_df
Ejemplo n.º 7
0
 def process(self, accession_no, doc_id):
     """Run the format-2 pipeline for one filing.

     Steps: ``Parser.preprocess`` -> ``Parser.parse_file`` ->
     ``Parser.post_process`` -> ``Parser.process_df``.  When every step
     yields data, the ``seq_id`` helper column is dropped and the frame is
     passed to ``Utils.df_to_database`` and written to
     ``<output_path_format2>/<accession_no>.xlsx``.

     Args:
         accession_no: Accession number used as the input/output file stem.
         doc_id: Document id forwarded to the parser and the database.

     Returns:
         ``1`` on success, ``"Unsuccessfull"`` when any step produces no
         data.  Every failure path now returns this value explicitly
         instead of falling through with an implicit ``None``.
     """
     logger = Utils.add_logger()
     text_list, registrant_name = Parser.preprocess(accession_no)
     if len(text_list) == 0:
         logger.debug(
             "No data found from the file for accssion_no:{}".format(
                 accession_no))
         return "Unsuccessfull"

     funds_list = Parser.parse_file(accession_no, text_list, doc_id)
     if len(funds_list) == 0:
         logger.debug(
             "funds_list is empty for accession_no:{}".format(accession_no))
         return "Unsuccessfull"

     header_df, table_df = Parser.post_process(accession_no, funds_list)
     if header_df.empty:
         logging.debug(
             "header_df is empty for accession_no:{}".format(accession_no))
         return "Unsuccessfull"

     final_df = Parser.process_df(accession_no, header_df, table_df)
     if final_df.empty:
         logging.debug(
             "final_df is empty for accession_no:{}".format(accession_no))
         return "Unsuccessfull"

     logging.debug(
         "final_df created successfully for accession_no:{}"
         .format(accession_no))
     print("final_df length:{}".format(len(final_df.index)))
     # seq_id only links header and table rows; it is not exported.
     final_df = final_df.drop(columns=["seq_id"])
     Utils.df_to_database(accession_no, doc_id, final_df)
     filename = os.path.join(
         CONFIG["Path"]["output_path_format2"],
         accession_no + ".xlsx")
     Utils.df_to_excel(accession_no, final_df, filename)
     return 1
Ejemplo n.º 8
0
 def tabledetails(table, lstdf, accession_no):
     """Convert one HTML table into a DataFrame, detecting its header row.

     Rows are read in order.  Until a header is found, each non-empty row is
     passed to ``Format8Parser.identify_header``.  If the very first
     non-empty row is not a header and ``lstdf`` is non-empty, ``lstdf`` is
     used as the header instead and that row is kept as the first data row
     (it is only stored when its length matches the header).  After the
     header is set, a row is appended when its cell count equals the header
     length, using either the filtered cells or the unfiltered ones.

     Args:
         table: Parsed table element exposing ``find_all('tr')``.
         lstdf: Column labels carried over from a previous table; may be
             empty.
         accession_no: Accession number, used in log messages and passed to
             ``identify_header``.

     Returns:
         The collected DataFrame, or an empty DataFrame when no header was
         identified.
     """
     logger = Utils.add_logger()
     line = []
     temp = False  # becomes True once a header row has been chosen
     length = 0  # number of header columns
     head = 0  # how many non-empty rows were tried as header
     tempf = 0
     # NOTE(review): fund is never reassigned in this function, so the
     # "Fund Name" back-fill at the end cannot trigger.
     fund = ''
     parseline = []
     checkdata = []
     table_rows = table.find_all('tr')
     for tr in table_rows:
         try:
             td = tr.find_all('td')
             for i in td:
                 line.append(i.text.replace('\n', '').replace('\xa0', ' '))
                 checkdata.append(i.text.strip().replace('\n', '').replace(
                     '\xa0', ' '))
             # Cells that are exactly one space are dropped from line but
             # remain (as '') in checkdata.
             line = [(lambda x: x.strip())(l) for l in line if l != ' ']
             #print(line)
             # if "QSR" in line or  "FP" in line:
             #     print("Line=={}".format(line))
             # Skip rows with no cells or only empty cells.
             if line == [] or len(line) == line.count(''):
                 line = []
                 continue
             if temp == False:
                 temp = Format8Parser.identify_header(line, accession_no)
                 head = head + 1
                 # print('head ' + str(head))
                 # print('temp ' + str(temp))
                 # print('temp ' + str(len(line)))
                 # print(len(lstdf))
                 # First non-empty row is data, not a header: fall back to
                 # the inherited labels and remember this row.
                 if temp == False and head == 1 and len(lstdf) != 0:
                     temp = True
                     parseline = line
                     if len(lstdf) == len(line):
                         line = lstdf
                     else:
                         lstdf = list(lstdf)
                         abcd = set(lstdf)
                         if 'Fund Name' in abcd:
                             lstdf.remove('Fund Name')
                         if 'FundName' in abcd:
                             lstdf.remove('FundName')
                         line = lstdf
                 if temp == True:
                     # Collapse repeated spaces inside the header labels.
                     line = [(lambda x: re.sub(' +', ' ', x))(l)
                             for l in line if l != ' ']
                     df = pd.DataFrame(columns=line)
                     col = line
                     length = len(line)
                     if parseline != [] and len(parseline) == len(line):
                         df = df.append(pd.Series(parseline, index=col),
                                        ignore_index=True)
                     #print("header")
                     #print(line)
                     #print(length)
                     ##print(df)
             elif length == len(line):
                 df = df.append(pd.Series(line, index=col),
                                ignore_index=True)
             elif length == len(checkdata):
                 df = df.append(pd.Series(checkdata, index=col),
                                ignore_index=True)
             line = []
             checkdata = []
         except Exception as error:
             # NOTE(review): line and checkdata are not cleared on this
             # path, so cells of a failed row carry over into the next row.
             logger.error(
                 "Exception in format8 parser for accession_no:{} -{}".
                 format(accession_no, error))
             print("Exception in format8 parser:{}".format(error))
     if temp == False:
         df = pd.DataFrame()
     else:
         for col in df.columns:
             if col.upper().find('FUND NAME') > -1:
                 tempf = 1
         if tempf != 1 and fund != '':
             df["Fund Name"] = fund
     return df
Ejemplo n.º 9
0
 def process(self, accession_no, doc_id):
     """Parse all vote tables of a format-8 filing and export them to Excel.

     The document is cut into chunks after every closing table tag (and
     after the "did not vote any proxies" sentence).  Each chunk is parsed
     with ``Format8Parser.tableparsed`` and normalised with
     ``Format8Parser.formDataFrame``; non-empty results are stacked.  The
     first ``FundName`` value and the columns of the latest parsed chunk
     are passed on to the next chunk.  The stacked frame is written to
     ``<output_path_format8>/<accession_no>.xlsx``.

     Args:
         accession_no: Accession number used as the input/output file stem.
         doc_id: Not used by this method.

     Returns:
         ``1`` when the export step completes, ``0`` when reading the file
         or exporting raises.  Errors in individual chunks are logged and
         printed, and that chunk is skipped.
     """
     logger = Utils.add_logger()
     try:
         olderFundName = ''
         lstdf = []
         dfs_all = pd.DataFrame()
         # os.path.join keeps this working whether or not the configured
         # directory ends with a path separator.
         path = os.path.join(CONFIG["Path"]["file_path"],
                             accession_no + ".dissem")
         content = Utils.read_file(path, accession_no)
         content = Format8Parser.segparsing(content)
         content = re.sub("=+", "", content)
         # Mark table boundaries so the text can be split per table.
         content = content.replace("<TABLE",
                                   "@@@@@@@@@@@@@@@@@@@@@\n <TABLE")
         content = content.replace("<table",
                                   "@@@@@@@@@@@@@@@@@@@@@\n <table")
         content = content.replace("</TABLE>",
                                   "</TABLE>\n ######################")
         content = content.replace("</table>",
                                   "</table>\n ######################")
         # Close rows that the source markup leaves unterminated.
         content = content.replace("</td>\n<tr>", "</td></tr><tr>")
         content = content.replace(
             "The Fund did not vote any proxies during this reporting period",
             "The Fund did not vote any proxies during this reporting period\n ######################"
         )
         fundVoteData = content.split('######################')
         for alist in fundVoteData:
             try:
                 df_all = Format8Parser.tableparsed(alist, olderFundName,
                                                    lstdf, accession_no)
                 if df_all is not None:
                     dfs_batch = Format8Parser.formDataFrame(df_all)
                     if not dfs_batch.empty:
                         if dfs_all.empty:
                             dfs_all = dfs_batch
                         else:
                             # pd.concat replaces DataFrame.append, which
                             # was removed in pandas 2.0.
                             dfs_all = pd.concat([dfs_all, dfs_batch],
                                                 ignore_index=True,
                                                 sort=False)
                     if "FundName" in dfs_all.columns:
                         olderFundName = dfs_all["FundName"][0]
                         lstdf = df_all.columns
             except Exception as error:
                 logger.error(
                     "Exception in for loop of Format8pipeline.process for accessionnumber:{}-{}"
                     .format(accession_no, error))
                 print(
                     "Exception in for loop of Format8pipeline.process for accessionnumber:{}-{}"
                     .format(accession_no, error))
         # dfs_all is always a DataFrame here, so no None check is needed.
         dfs_all = Format8Parser.remove_spaces_from_df(dfs_all)
         filename = os.path.join(CONFIG['Path']['output_path_format8'],
                                 accession_no + '.xlsx')
         Utils.df_to_excel(accession_no, dfs_all, filename)
         return 1
     except Exception as error:
         logger.error(
             "Exception in Format8pipeline.process for accessionnumber:{}-{}"
             .format(accession_no, error))
         print(
             "Exception in Format8pipeline.process for accessionnumber:{}-{}"
             .format(accession_no, error))
         return 0
    def parse_file(accession_no, text_list, doc_id):
        """Extract header records and fixed-width vote tables from sections.

        Every section in ``text_list`` is scanned line by line:

        * A line containing ``"Proposal Vote"`` starts a table.  Rows are
          read from three lines further down until a line contains
          ``case3`` or ``case4`` or the section is nearly exhausted.  Each
          non-empty row is cut at columns 6, 65, 79 and 110 and stored as
          one tab-separated string.  The table list starts with
          ``seq_id - 1``, the id of the most recent header.
        * Labelled fields (``Agenda Number:``, ``Security:``,
          ``Meeting Type:``, ``Meeting Date:``, ``Ticker:``, ``ISIN:``) are
          picked up from chunks separated by three or more whitespace
          characters.  ``ISIN:`` completes a header: an 11-item list is
          appended and ``seq_id`` is incremented.

        Args:
            accession_no: Accession number stored in each header record.
            text_list: Section strings of one filing.
            doc_id: Document id stored in each header record.

        Returns:
            A list mixing header lists and table lists in document order,
            or an empty list when parsing raises (the error is logged and
            printed).
        """
        logger = Utils.add_logger()
        try:
            funds_list = list()
            seq_id = 1  # sequencing
            prev_fundname = ""
            for index, temp in enumerate(text_list):
                split = temp.split("\n")
                for x, spli in enumerate(split):
                    if "Proposal Vote" in spli:
                        table_list = [seq_id - 1]
                        # Table rows start three lines below the caption.
                        num = x + 3
                        # NOTE(review): case3 / case4 are not defined in
                        # this function; they must exist in an enclosing
                        # scope.
                        while (num < (len(split) - 2)) and (
                                case3
                                not in split[num]) and (case4
                                                        not in split[num]):
                            row_text = split[num]
                            if len(row_text) > 0:
                                # Fixed-width columns joined with tabs.
                                table_list.append("\t".join((
                                    row_text[:6].strip(),
                                    row_text[6:65].strip(),
                                    row_text[65:79].strip(),
                                    row_text[79:110].strip(),
                                    row_text[110:].strip(),
                                )))
                            num += 1
                        funds_list.append(table_list)
                    # Raw string: "\s" in a normal literal is an invalid
                    # escape sequence.
                    sep = re.split(r"\s{3,}", spli)
                    header_list = list()
                    for i, s in enumerate(sep):
                        if "Agenda Number:" in s:
                            # The fund name is the last chunk of the
                            # preceding section.
                            # NOTE(review): for index 0 this reads
                            # text_list[-1], i.e. the last section.
                            fund = text_list[index - 1]
                            fund_name = re.split(r'\s{2,}', fund)[-1]
                            fund_name = re.sub("\n", "", fund_name).strip()
                            if len(fund_name) > 1:
                                prev_fundname = fund_name
                            else:
                                # Reuse the last fund name seen.
                                fund_name = prev_fundname
                            agenda_number = s.split(':')[1].strip()
                            # The company name is the chunk just before the
                            # agenda number on the same line.
                            inc_name = sep[i - 1].strip()
                        elif "Security:" in s:
                            security = s.split(':')[1].strip()
                        elif "Meeting Type:" in s:
                            meeting_type = s.split(':')[1].strip()
                        elif "Meeting Date:" in s:
                            meeting_date = s.split(':')[1].strip()
                        elif "Ticker:" in s:
                            ticker = s.split(':')[1].strip()
                        elif "ISIN:" in s:
                            ISIN = s.split(':')[1].strip()
                            # ISIN is treated as the last header field; the
                            # other fields keep whatever value was parsed
                            # most recently.
                            header_list.append(seq_id)
                            header_list.append(doc_id)
                            header_list.append(accession_no)
                            header_list.append(fund_name)
                            header_list.append(inc_name)
                            header_list.append(security)
                            header_list.append(meeting_date)
                            header_list.append(meeting_type)
                            header_list.append(ISIN)
                            header_list.append(ticker)
                            header_list.append(agenda_number)
                            funds_list.append(header_list)
                            seq_id += 1
        except Exception as error:
            logger.error(
                "Error in parsing the file for accession_no-{}-{}".format(
                    accession_no, error))
            print("Error in parsing file for accession_no-{}-{}".format(
                accession_no, error))
            return list()

        return funds_list
    def post_process(accession_no, funds_list):
        """Turn the alternating header/table lists into two DataFrames.

        Entries at even positions of ``funds_list`` are 11-item header
        records; entries at odd positions are table lists whose integer
        items set the current ``seq_id`` and whose string items are
        tab-separated proposal rows.

        Args:
            accession_no: Accession number, used only in log messages.
            funds_list: List of header lists and table lists.

        Returns:
            ``(main_header_df, table_df)``, or two empty DataFrames when
            any step raises (the error is logged and printed).
        """
        logger = Utils.add_logger()
        header_columns = [
            "seq_id", "DocumentId", "AccesssionNumber", "FundName",
            "CompanyName", "SecurityId", "MeetingDate", "MeetingType",
            "ISIN", "Ticker", "AgendaNumber"
        ]
        table_columns = [
            "seq_id", "ProposalNumber", "Proposal", "ProposedBy",
            "ForAgainstManagement", "VoteCast"
        ]
        try:
            seq_id = 1
            main_header_df = pd.DataFrame(columns=header_columns)
            table_df = pd.DataFrame(columns=table_columns)
            for position, data in enumerate(funds_list):
                if position % 2 == 0:
                    # Header record: one row of exactly 11 values.
                    header_row = np.asarray(data).reshape(1, 11)
                    main_header_df = main_header_df.append(
                        pd.DataFrame(header_row, columns=header_columns))
                    continue

                # Table list: one output row per item.
                for val in data:
                    row_list = [seq_id]
                    if isinstance(val, int):
                        # An integer item switches the id used for the
                        # following rows; this item's own row keeps the
                        # previous id and is padded with None below.
                        seq_id = val
                    else:
                        for value in re.split(r"\t", val):
                            row_list.append(value.lstrip())
                    tp = np.asarray(row_list)
                    # Pad short rows to six cells, one None at a time.
                    for _ in range(len(tp), 6):
                        tp = np.append(tp, None)
                    temp_df = pd.DataFrame(tp.reshape(1, 6),
                                           columns=table_columns)
                    table_df = table_df.append(temp_df)
        except Exception as error:
            logger.error(
                "Error in post-processing for accession_no-{}-{}".format(
                    accession_no, error))
            print("Error in post-processing for accession_no-{}-{}".format(
                accession_no, error))
            return pd.DataFrame(), pd.DataFrame()

        return main_header_df, table_df