コード例 #1
0
 def process(self, accession_no, doc_id):
     """Parse the tables of one Format6 filing and export them to Excel.

     Reads ``<file_path><accession_no>.dissem``, pre-processes the text with
     ``Format6parser.segparsing`` and walks every ``<table>`` element found
     by BeautifulSoup.  A table for which ``Format6APipeline.check_table``
     is false is remembered (as the string form of its contents) and
     prepended to the next table that passes the check; passing tables are
     converted with ``Format6parser.tabledetails`` and accumulated into one
     frame, which is written to
     ``<output_path_format6B>/<accession_no>.xlsx``.

     A failure while handling a single table is logged and that table is
     skipped; it does not abort the run.

     Args:
         accession_no: Accession number; used to build the input path, the
             output file name and the log messages.
         doc_id: Not used by this method.

     Returns:
         1 after ``Utils.df_to_excel`` has been called, 0 if an exception
         escaped the per-table handling.
     """
     logger = Utils.add_logger()
     try:
         path = CONFIG["Path"]["file_path"] + accession_no + ".dissem"
         content = Utils.read_file(path, accession_no)
         content = Format6parser.segparsing(content)
         # Insert the missing </tr> where a cell is directly followed by a
         # new row, so the HTML parser sees properly closed rows.
         content = content.replace("</td>\n<tr>", "</td></tr><tr>")
         soup = BeautifulSoup(content, "html.parser")
         prev_table = None
         finaldf = pd.DataFrame()
         for table in soup.findAll('table'):
             # Reset per table: a frame whose merge raised must not be
             # retried (and re-fail or duplicate) on the next iteration.
             dffa = None
             try:
                 if Format6APipeline.check_table(table):
                     if prev_table is not None:
                         # Glue the pending fragment onto this table and
                         # parse the combination as one table.
                         prev_table = prev_table + str(table.contents)
                         dffa = Format6parser.tabledetails(
                             BeautifulSoup(prev_table, "html.parser"))
                         prev_table = None
                     else:
                         dffa = Format6parser.tabledetails(table)
                 else:
                     # Only the most recent non-matching table is kept.
                     prev_table = str(table.contents)
                 if dffa is None:
                     continue
                 if finaldf.empty:
                     finaldf = dffa
                 else:
                     # DataFrame.append was removed in pandas 2.0;
                     # pd.concat is the equivalent call.
                     finaldf = pd.concat([finaldf, dffa], sort=True)
             except Exception as error:
                 logger.error(
                     "Exception in table parsing for format6 for accession_no -{} -{}"
                     .format(accession_no, error))
         filename = os.path.join(CONFIG['Path']['output_path_format6B'],
                                 accession_no + '.xlsx')
         Utils.df_to_excel(accession_no, finaldf, filename)
         return 1
     except Exception as error:
         logger.error(
             "Exception in format6 pipeline for accession_no -{}-{}".format(
                 accession_no, error))
         print("Exception in format6A pipeline-{}".format(error))
         return 0
コード例 #2
0
 def process(self, accession_no, doc_id):
     """Parse one Format4 filing, store it in the database and export to Excel.

     Reads ``<file_path><accession_no>.dissem``, extracts the plain text
     with BeautifulSoup and splits it into one chunk per ``FUND:`` marker.
     Each chunk is converted to a frame by
     ``Format4Parser.companyfundDetails``; the frames are concatenated,
     post-processed, renamed to the output column names, and handed to
     ``Utils.df_to_database`` and ``Utils.df_to_excel``.

     Exceptions raised by the ``Format4Parser`` helpers or by the
     post-processing steps are not caught here and propagate to the caller.

     Args:
         accession_no: Accession number; used to build the input path, the
             output file name, the ``AccesssionNumber`` column and the log
             messages.
         doc_id: Document id; written to the ``DocumentId`` column and
             passed to ``Utils.df_to_database``.

     Returns:
         1 on success, or the string ``"unsuccessfull"`` when reading /
         splitting the document or merging the per-fund frames fails.
     """
     logger = Utils.add_logger()
     try:
         path = CONFIG["Path"]["file_path"] + accession_no + ".dissem"
         content = Utils.read_file(path, accession_no)
         content = content.replace('&#160;', '\xa0')
         soup = BeautifulSoup(content, "html.parser")
         contents = soup.text
         contents = contents.replace('SIGNATURES', '----------------------------------')
         # Put a split marker in front of every fund header.
         contents = contents.replace('FUND:', '############### \n FUND:')
         fundlist = contents.split('###############')
     except Exception as error:
         logger.error("Exception in parsing through bs4 for accession_no -{} -{}".format(accession_no, error))
         return "unsuccessfull"

     # The first chunk is whatever precedes the first 'FUND:' marker, so it
     # is skipped.
     frames = []
     for fund_text in fundlist[1:]:
         fund_name = Format4Parser.fundName(fund_text.split('ISSUER')[0])
         frames.append(Format4Parser.companyfundDetails(fund_text, fund_name))

     try:
         # DataFrame.append was removed in pandas 2.0.  The empty seed frame
         # keeps the previous behaviour of yielding an empty frame when
         # there are no funds (or every helper result is None).
         final_df = pd.concat([pd.DataFrame(), *frames])
     except Exception as error:
         logger.error("Exception in pipeline for df merging for accession_no -{} -{}".format(accession_no, error))
         return "unsuccessfull"

     final_df = Format4Parser.concat_rowData(final_df)
     final_df = final_df.drop(columns=['ID'])
     final_df = final_df.rename(columns={
         "ISSUER": "CompanyName",
         "MEETING DATE": "MeetingDate",
         "Proposal No": "ProposalNumber",
         "PROPOSAL": "Proposal",
         "PROPOSED BY": "ProposedBy",
         "VOTED?": "Voted",
         "MGMT": "ForAgainstManagement",
         "TICKER": "Ticker",
         "VOTE CAST": "VoteCast",
     })
     final_df["MeetingDate"] = final_df["MeetingDate"].apply(Utils.parsing_date)
     # NOTE(review): column name is spelled with three 's'; left unchanged
     # because downstream consumers may depend on it -- confirm.
     final_df["AccesssionNumber"] = accession_no
     final_df["DocumentId"] = doc_id
     Utils.df_to_database(accession_no, doc_id, final_df)
     filename = os.path.join(CONFIG["Path"]["output_path_format4"], accession_no + ".xlsx")
     Utils.df_to_excel(accession_no, final_df, filename)
     return 1
コード例 #3
0
 def process(self, accession_no, doc_id):
     """Parse the vote tables of one Format8 filing and export them to Excel.

     Reads ``<file_path><accession_no>.dissem``, pre-processes it with
     ``Format8Parser.segparsing``, strips runs of ``=`` and inserts marker
     lines around every ``<table>`` element (and after the "did not vote
     any proxies" sentence).  The text is then split on the closing marker
     and each segment is parsed with ``Format8Parser.tableparsed`` and
     ``Format8Parser.formDataFrame``; non-empty results are accumulated and
     written to ``<output_path_format8>/<accession_no>.xlsx``.

     A failure while handling a single segment is logged and printed, and
     that segment is skipped.

     Args:
         accession_no: Accession number; used to build the input path, the
             output file name and the log messages, and passed to
             ``Format8Parser.tableparsed``.
         doc_id: Not used by this method.

     Returns:
         1 after ``Utils.df_to_excel`` has been called, 0 if an exception
         escaped the per-segment handling.
     """
     logger = Utils.add_logger()
     try:
         # State carried from one segment to the next and handed to
         # Format8Parser.tableparsed.
         olderFundName = ''
         lstdf = []
         dfs_all = pd.DataFrame()
         path = CONFIG["Path"]["file_path"] + accession_no + ".dissem"
         content = Utils.read_file(path, accession_no)
         content = Format8Parser.segparsing(content)
         content = re.sub("=+", "", content)
         # Mark the start of every table ...
         content = content.replace("<TABLE",
                                   "@@@@@@@@@@@@@@@@@@@@@\n <TABLE")
         content = content.replace("<table",
                                   "@@@@@@@@@@@@@@@@@@@@@\n <table")
         # ... and the end of every table, which is where the text is split.
         content = content.replace("</TABLE>",
                                   "</TABLE>\n ######################")
         content = content.replace("</table>",
                                   "</table>\n ######################")
         # Insert the missing </tr> where a cell is directly followed by a
         # new row.
         content = content.replace("</td>\n<tr>", "</td></tr><tr>")
         content = content.replace(
             "The Fund did not vote any proxies during this reporting period",
             "The Fund did not vote any proxies during this reporting period\n ######################"
         )
         for segment in content.split('######################'):
             try:
                 df_all = Format8Parser.tableparsed(segment, olderFundName,
                                                    lstdf, accession_no)
                 if df_all is None:
                     continue
                 dfs_batch = Format8Parser.formDataFrame(df_all)
                 if not dfs_batch.empty:
                     if dfs_all.empty:
                         dfs_all = dfs_batch
                     else:
                         # DataFrame.append was removed in pandas 2.0;
                         # pd.concat is the equivalent call.
                         dfs_all = pd.concat([dfs_all, dfs_batch],
                                             ignore_index=True,
                                             sort=False)
                 if "FundName" in dfs_all.columns:
                     # NOTE(review): this reads the value at index label 0
                     # of the accumulated frame, not the most recent row --
                     # confirm that is the intended "older" fund name.
                     olderFundName = dfs_all["FundName"][0]
                     lstdf = df_all.columns
             except Exception as error:
                 logger.error(
                     "Exception in for loop of Format8pipeline.process for accessionnumber:{}-{}"
                     .format(accession_no, error))
                 print(
                     "Exception in for loop of Format8pipeline.process for accessionnumber:{}-{}"
                     .format(accession_no, error))
         dfs_all = Format8Parser.remove_spaces_from_df(dfs_all)
         filename = os.path.join(CONFIG['Path']['output_path_format8'],
                                 accession_no + '.xlsx')
         Utils.df_to_excel(accession_no, dfs_all, filename)
         return 1
     except Exception as error:
         logger.error(
             "Exception in Format8pipeline.process for accessionnumber:{}-{}"
             .format(accession_no, error))
         print(
             "Exception in Format8pipeline.process for accessionnumber:{}-{}"
             .format(accession_no, error))
         return 0