def date(xentry, name, parsed):
    """ insert a date-formatted element into the entry """
    if not parsed:
        return
    formatted = time.strftime("%Y-%m-%dT%H:%M:%SZ", parsed)
    xdate = createTextElement(xentry, name, formatted)
    formatted = time.strftime(config.date_format(), parsed)
    xdate.setAttribute('planet:format', formatted.decode('utf-8'))
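
# Hedged usage sketch for date(): given a DOM element for a feed entry, it appends a
# child element (e.g. <updated>) whose text is the ISO-8601 timestamp and whose
# planet:format attribute carries the human-readable form from config.date_format().
# The xentry argument is assumed to be a DOM element built elsewhere in this module;
# this helper is illustrative only and is never called.
def _example_date_usage(xentry):
    date(xentry, 'updated', time.gmtime())
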
def bridger_crawling(para, full_name, operator, case_name):
    """
    :param para: Company name
    :param full_name: Full name of current user
    :param operator: Operator account used to look up BridgerInsight credentials
    :param case_name: Case identifier used to build the target evidence folders
    :return: Evidence if found, otherwise NOT_FOUND
    """
    logger.info("BridgerInsight Helper: " + str(para) + " " + str(full_name))
    keyword = para  # MARK
    para = para.replace('/', '_')
    username, password = config.get_bridger_account(operator)
    evidence = dict()
    url = bridger_url(username, password, para)
    if url != "unicode detected!!!!!!":
        crawler = None
        try:
            req = urllib2.Request(url)
            res = urllib2.urlopen(req)
            crawler = res.read()
        except urllib2.URLError, e:
            logger.error("Error while crawling: " + str(e))
            if hasattr(e, "code"):
                logger.error("Code: " + str(e.code))
            if hasattr(e, "reason"):
                logger.error("Reason: " + str(e.reason))
            return NOT_FOUND
        if crawler == NOT_FOUND:
            logger.warning(NOT_FOUND)
            return NOT_FOUND
        # logger.info("BridgerInsight Result: " + str(crawler))
        config.mkdirs(config.local_bridge_folder + config.date_format(config.get_today()))
        name_orig = para.replace('/', '_')
        para = (case_name + '_' + para).replace('/', '_')
        item = json.loads(crawler)
        if item['Records'] is None or item["Records"]["ResultRecord"][0]["Watchlist"]["Matches"] is None:
            bridger_no_result(operator, name_orig, bridger_pdf_local(para))
            evidence["type"] = "bridgerInsight"
            evidence["url"] = config.current_url + bridger_pdf_web(para)
            evidence["missing"] = False
            evidence["name"] = keyword
            logger.info("Evidence: " + str(evidence))
            return evidence
        item['full name'] = operator
        crawler = json.dumps(item)
        name = para.replace(" ", "_")
        fileObject = open(name + "_bridge.json", 'w')
        fileObject.write(str(crawler))
        fileObject.close()
        exists = bridger_insight_parser(name + "_bridge.json", name + "_bridge_temp.json")
        if not exists:
            return NOT_FOUND
        print "pdf target======"
        print bridger_pdf_local(name)
        try:
            buildPdf(bridger_pdf_local(name), name + "_bridge_temp.json")
            # buildPdf(bridger_pdf_target(name, case_name), name + "_bridge_temp.json")
            if os.path.isfile(bridger_pdf_local(name)):
                print "File exists and will start copy BRIDGERINSIGHT!!!!"
                shutil.copy(bridger_pdf_local(name), bridger_pdf_target(name, case_name))
        except:
            logger.error("Failed to generate PDF for: " + str(name))
            logger.error(traceback.format_exc())
            return NOT_FOUND
        abstract_data = []
        hit_flag = False
        with open(name + "_bridge.json") as data_file:
            ab_data = json.load(data_file)
        for temp_item in ab_data['Records']['ResultRecord']:
            wl = temp_item['Watchlist']['Matches']['WLMatch']
            for index, wl_item in enumerate(wl):
                if wl_item['EntityDetails'] is None and wl_item["CountryDetails"] is not None:
                    entry = {}
                    entry['EntityScore'] = wl_item['BestCountryScore']
                    if int(entry['EntityScore']) == 100:
                        hit_flag = True
                    entry['FileName'] = wl_item['File']['Name'][:-4]
                    entry['Entity Name'] = wl_item['EntityName']
                    entry["index"] = index + 1
                    abstract_data.append(entry)
                elif wl_item["EntityDetails"] is not None and wl_item["CountryDetails"] is None:
                    entry = {}
                    entry['EntityScore'] = wl_item['EntityScore']
                    if int(entry['EntityScore']) == 100:
                        hit_flag = True
                    entry['FileName'] = wl_item['File']['Name'][:-4]
                    entry['Entity Name'] = wl_item['EntityName']
                    entry["index"] = index + 1
                    abstract_data.append(entry)
        target_json_folder = config.target_folder + case_name + "/bridgerJson"
        # 1 Create a folder which will contain JSON files for every user.
        if not os.path.exists(target_json_folder):
            os.makedirs(target_json_folder)
        # 2 Check whether there is a JSON file that matches bocID.json
        if os.path.isfile(bridger_temp_json_target(name, case_name)) and os.access(bridger_temp_json_target(name, case_name), os.R_OK):
            print "File exists and is readable"
        else:
            print "Either file is missing or is not readable"
            os.mknod(bridger_temp_json_target(name, case_name))
            print "Will check whether json exists"
            print os.path.isfile(bridger_temp_json_target(name, case_name)) and os.access(bridger_temp_json_target(name, case_name), os.R_OK)
        # 3 Write userInfo into that JSON
        with open(bridger_temp_json_target(name, case_name), "w") as fileObject_1:
            json.dump(abstract_data, fileObject_1, indent=4, ensure_ascii=False)
        if os.path.isfile(name + "_bridge_temp.json"):
            print "Will COPY original Bridger data."
            shutil.copy(name + "_bridge_temp.json", bridger_json_target(name, case_name))
        # os.system("rm " + name + "_bridge.json " + name + "_bridge_temp.json")
        os.remove(name + "_bridge.json")
        os.remove(name + "_bridge_temp.json")
        evidence["type"] = "bridgerInsight"
        evidence["url"] = config.current_url + bridger_pdf_web(name)
        evidence["missing"] = False
        evidence["name"] = keyword
        evidence["isHit"] = hit_flag
        logger.info("Evidence: " + str(evidence))
        return evidence
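
# Hedged usage sketch for bridger_crawling(): the literal arguments below are invented
# placeholders, not values from any real case. On success the function returns an
# evidence dict (keys: type, url, missing, name, and isHit when watchlist matches
# exist); otherwise it returns NOT_FOUND. This helper is illustrative and never called.
def _example_bridger_crawling_usage():
    evidence = bridger_crawling("ACME Trading Ltd", "Jane Doe", "analyst01", "CASE-0001")
    if evidence == NOT_FOUND:
        logger.info("No BridgerInsight evidence for this name")
    else:
        logger.info("BridgerInsight PDF available at: " + evidence["url"])
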
def dowjones_crawling(item, operator, case_name):
    """
    :param item: Company name
    :param operator: Operator account used to look up Dow Jones credentials
    :param case_name: Case identifier used to build the target evidence folders
    :return: List of PDF URL
    """
    # item = urllib.quote(item.encode('utf8'), ':/')
    logger.info("Dowjones item: " + str(item))
    username, password = config.get_dowjones_account(operator)
    # Mark
    item = item.replace('/', '_')
    craw_str = dowjones_url(username, password, item)
    if craw_str != "unicode detected!!!!!!":
        print craw_str
        logger.debug(craw_str)
        name_item = config.filename_format(item)
        ################
        # stored_file = config.local_dowjones_folder + config.date_format(config.get_today()) + "/" + name_item + ".pdf"
        # if os.path.exists(stored_file):
        #     logger.debug("Downloaded PDF File Already Exists!")
        #     entry_file = config.current_url + config.web_dowjones_folder + config.date_format(config.get_today()) + "/" + name_item + ".pdf"
        #     return [entry_file]
        ##########
        stored_file = config.local_dowjones_folder + config.date_format(config.get_today()) + "/" + name_item + "_Dowjones_summary.pdf"
        result = []
        # if os.path.exists(stored_file):
        #     logger.debug("Downloaded PDF File Already Exists!")
        #     entry_file = config.current_url + config.web_dowjones_folder + config.date_format(config.get_today()) + "/" + name_item + "_Dowjones_summary.pdf"
        #     # pdf_local_path = config.local_dowjones_folder + config.date_format(config.get_today()) + "/"
        #     # entry_path = config.current_url + config.web_dowjones_folder + config.date_format(config.get_today()) + "/"
        #     # result.append(entry_file)
        #     # pdf_file = os.listdir(pdf_local_path)
        #     for file in pdf_file:
        #         if name_item + "_companyDetail_" in file and file.startswith(name_item):
        #             result.append(entry_path + file)
        #     return result
        crawler = None
        try:
            req = urllib2.Request(craw_str)
            res = urllib2.urlopen(req)
            crawler = res.read()
        except urllib2.URLError, e:
            logger.error("Error while crawling: " + str(e))
            if hasattr(e, "code"):
                logger.error("Code: " + str(e.code))
            if hasattr(e, "reason"):
                logger.error("Reason: " + str(e.reason))
            return NOT_FOUND
        if not crawler or crawler == NOT_FOUND:
            logger.warning(NOT_FOUND)
            return NOT_FOUND
        print "DowJones Result", crawler
        crawler = json.loads(crawler)
        if str(crawler['urls'][0]) == '':
            logger.warning(NOT_FOUND)
            return NOT_FOUND
        result = []
        # if "found" in crawler and crawler["found"]:
        logger.debug("Dow Jones Crawling Result: " + str(crawler))
        for craw_item in crawler['urls']:
            name = craw_item['link']
            name = name.split('/')
            name = name[len(name) - 1]
            crawler_path = config.crawler_url + config.dowjones_pdf_folder + name
            local_dir = config.local_dowjones_folder + config.date_format(config.get_today())
            config.mkdirs(local_dir)
            # local = local_dir + "/" + name_item + ".pdf"
            local = local_dir + "/" + name
            target_folder = config.target_folder + case_name + "/originalEvidence"
            print "target_folder========"
            print target_folder
            if not os.path.exists(target_folder):
                print "target_folder doesnt exist!!"
                os.makedirs(target_folder)
            target = target_folder + "/" + name
            logger.info("remote path: " + crawler_path)
            logger.info("local path: " + local)
            crawler_path = urllib.quote(crawler_path.encode('utf8'), ':/')
            urllib.urlretrieve(crawler_path, local)
            # urllib.urlretrieve(crawler_path, target)
            if os.path.isfile(local):
                print "File exists and will start copy DOWJONES"
                shutil.copy(local, target)
            ret_path = config.current_url + config.web_dowjones_folder + config.date_format(config.get_today()) + "/" + name
            # ret_path = config.current_url + config.web_dowjones_folder + config.date_format(config.get_today()) + "/" + name_item + ".pdf"
            logger.info("Return Path: " + ret_path)
            temp = ret_path
            result.append({'origin': craw_item['origin'], 'link': temp})
            # if not crawler["found"]:
            #     break
        # else:
        #     logger.warning(NOT_FOUND)
        #     return NOT_FOUND
        logger.info("Dowjones Result: " + str(result))
        return result
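
# Hedged usage sketch for dowjones_crawling(): the arguments below are invented
# placeholders. On success the function returns a list of dicts with 'origin' and
# 'link' keys; on failure it returns NOT_FOUND. Illustrative only, never called.
def _example_dowjones_crawling_usage():
    urls = dowjones_crawling("ACME Trading Ltd", "analyst01", "CASE-0001")
    if urls == NOT_FOUND:
        logger.info("No Dow Jones PDFs for this name")
    else:
        for entry in urls:
            logger.info(entry['origin'] + ": " + entry['link'])
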
def bridger_pdf_web(para):
    return config.web_bridge_folder + config.date_format(config.get_today()) + "/" + config.create_pdf_name(para)
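
# Hedged sketch: bridger_pdf_web() only concatenates configured path pieces. For
# example, assuming web_bridge_folder is "bridge/", date_format(get_today()) yields
# "2016-01-01" and create_pdf_name("ACME Ltd") yields "ACME_Ltd.pdf" (all of these
# values are illustrative guesses, not real configuration):
#     bridger_pdf_web("ACME Ltd")  ->  "bridge/2016-01-01/ACME_Ltd.pdf"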
        if hasattr(e, "reason"):
            logger.error("Reason: " + str(e.reason))
        return NOT_FOUND
    if crawler is None or crawler == NOT_FOUND:
        logger.warning("No crawler results: " + str(crawler_url))
        return NOT_FOUND
    logger.debug("Google Crawler Result: " + str(crawler))
    store_path = config.local_google_folder
    res_folder = os.path.exists(store_path)
    if not res_folder:
        os.system('mkdir ' + store_path)
    out_pdf = item.replace(' ', '_') + '.pdf'
    local = config.local_google_folder + config.date_format(config.get_today()) + "/"
    config.mkdirs(local)
    local = local + out_pdf
    target_folder = config.target_folder + case_name + "/originalEvidence"
    print "target_folder========"
    print target_folder
    if not os.path.exists(target_folder):
        print "target_folder doesnt exist!!"
        os.makedirs(target_folder)
    target = target_folder + "/" + out_pdf
    full_url = config.crawler_url + config.google_pdf_folder + craw_pdf
    logger.info("Crawling: " + full_url + " " + local)
    logger.info("Crawling:================ " + full_url + " " + target)
    urllib.urlretrieve(full_url, local)
    # urllib.urlretrieve(full_url, target)