def _process_google_item(self, item, spider):
    q = session.query(Company).filter(Company.name == item['company_name'])
    logging.info('Google item: %s', item)
    if q.count() and item['update']:
        c = q.first()
        website = 'NA'
        if c.website:
            website = c.website
        elif c.website_long:
            website = urlparse.urlsplit(c.website_long)[1]
        if c.manual_entry == 'Yes':
            q.update({
                'website': item['url'],
                'website_long': item['url_long'],
                'website_updated': datetime.now(),
                'website_old': website,
                'last_update': datetime.now(),
                'manual_entry': 'manual',
            })
            logging.info('MANUAL')
        elif c.manual_entry == 'old':
            q.update({
                'website': item['url'],
                'website_long': item['url_long'],
                'website_updated': datetime.now(),
                'website_old': website,
                'last_update': datetime.now(),
                'manual_entry': 'No'
            })
            session.commit()
        else:
            dn = datetime.now()
            update_item = {
                'website': item['url'],
                'website_long': item['url_long'],
                'website_updated': datetime.now(),
                'website_old': website,
                'last_update': dn
            }
            logging.info(update_item)
            q.update(update_item)
    elif not q.count():
        new_company = Company(name=item['company_name'],
                              website=item['url'],
                              website_long=item['url_long'])
        session.add(new_company)
def _log_update(self, log):
    calc_log = session.query(CalculationsTime).first()
    if not calc_log:
        calc_log = CalculationsTime(**log)
        session.add(calc_log)
    else:
        session.query(CalculationsTime).update(log)
    session.commit()
def log_start(self, type, description='', additional_data=''):
    le = LogExecutions(type=type,
                       description=description,
                       start_datetime=datetime.now(),
                       additional_data=additional_data)
    session.add(le)
    session.commit()
    self.current_session = le
def _process_evaluation_item(self, item, spider):
    q = session.query(DbGoogleEvaluation).filter(
        DbGoogleEvaluation.g_company_website == item['company_website'],
        DbGoogleEvaluation.g_search_word == item['search_word'])
    if q.count() and item['update']:
        q.update({
            'g_found_result': item['found_result'],
            'g_search_url': item['search_url'],
            'g_last_update': datetime.fromtimestamp(item['last_update'])
        })
    elif not q.count():
        new_google_ev = DbGoogleEvaluation(
            g_company_website=item['company_website'],
            g_search_word=item['search_word'],
            g_found_result=int(item['found_result']),
            g_search_url=item['search_url'],
            g_last_update=datetime.fromtimestamp(item['last_update']),
            g_timestamp=datetime.fromtimestamp(item['timestamp']))
        session.add(new_google_ev)
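# A minimal dispatch sketch, assuming the two helpers above are methods of a single
# Scrapy item pipeline and are selected by item type. The class name
# GoogleSearchPipeline and the item class GoogleEvaluationItem are hypothetical, and
# the final commit is an assumption (the helpers only commit in some branches);
# Scrapy does expect process_item to return the item (or raise DropItem).
class GoogleSearchPipeline(object):
    # _process_google_item and _process_evaluation_item above are assumed to be
    # defined on this class.

    def process_item(self, item, spider):
        if isinstance(item, GoogleEvaluationItem):  # hypothetical item class
            self._process_evaluation_item(item, spider)
        else:
            self._process_google_item(item, spider)
        session.commit()  # assumed flush point for the adds/updates done above
        return item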
def process_item(self, item, spider):
    logging.info('Xing item: %s', item)
    update = item.get('update')
    company_name = item.get('company_name')
    xing_page_url = item.get('xing_page_url')
    impressum_url = item.get('impressum_url')
    description = item.get('about_us')[:8000] if item.get('about_us') else None
    if item.get('partial_update'):
        item = dict(street_xing='',
                    city_xing='',
                    description_xing='',
                    zipcode_xing='',
                    country_xing='',
                    tel_xing='',
                    fax_xing='',
                    company_email_xing='',
                    industry_xing='',
                    established_in_xing=None,
                    products_xing='',
                    employees_size_xing='',
                    company_website_x='N/A',
                    last_update_x=func.now(),
                    employees_group_xing_x='')
    else:
        item = dict(street_xing=item.get('street'),
                    city_xing=item.get('city'),
                    description_xing=description,
                    zipcode_xing=item.get('postal_code'),
                    country_xing=item.get('country'),
                    tel_xing=item.get('phone'),
                    fax_xing=item.get('fax'),
                    company_email_xing=item.get('email'),
                    industry_xing=item.get('industry'),
                    established_in_xing=item.get('established'),
                    products_xing=item.get('products'),
                    employees_size_xing=item.get('employees_number'),
                    company_website_x=item.get('url'),
                    last_update_x=func.now(),
                    employees_group_xing_x=item.get('registered_employees_number'))
    company = session.query(Company).filter_by(name=company_name).first()
    if not company:
        return
    if update:
        # company = company.filter(Company.xing_page != 'NA', Company.xing_page is not None).first()
        session.query(XingCompanyDb).filter(
            XingCompanyDb.xc_id == company.id).update(
                item, synchronize_session=False)
    else:
        new_entry = XingCompanyDb(company_name_x=company_name,
                                  timestamp_x=func.now(),
                                  xc_id=company.id,
                                  **item)
        session.add(new_entry)
    company.last_update = func.now()
    company.xing_page_update = func.now()
    company.xing_page = xing_page_url
    company.impressum_link = impressum_url
def process_item(self, item, spider):
    update = item['update']
    company_name = item['company_name'].decode("utf-8")
    logging.info('Wikipedia item for %s: %s', company_name, item)
    company_website = item['company_website']
    headquarters = item.get('sitz', '')[:50] if item.get('sitz') else None
    manual_update_item = {}
    if item.get('wiki_company_website') and len(item['wiki_company_website']) > 130:
        parsed_url = urlparse.urlparse(item['wiki_company_website'])
        item['wiki_company_website'] = '{protocol}://{hostname}'.format(
            protocol=parsed_url.scheme, hostname=parsed_url.hostname)
    if item.get('partial_update'):
        item = dict(summary_wikipedia_w='',
                    categories_wikipedia_w='',
                    revenue_wikipedia_w='',
                    revenue_currency_wiki_w='',
                    branch_wikipedia_w='',
                    wiki_url_w='N/A',
                    headquarters_wiki_w='',
                    employees_wikipedia_w='',
                    company_website_w='',
                    last_update_w=func.now())
    else:
        item = dict(summary_wikipedia_w=item['summary'],
                    categories_wikipedia_w=item['categories'],
                    revenue_wikipedia_w=item.get('revenue', ''),
                    revenue_currency_wiki_w=item.get('currency', ''),
                    branch_wikipedia_w=item.get('branche', ''),
                    wiki_url_w=item['url'],
                    headquarters_wiki_w=headquarters,
                    employees_wikipedia_w=item.get('mitarbeiter', ''),
                    company_website_w=item.get('wiki_company_website', ''),
                    last_update_w=func.now())
        # Manual entries keep their stored wiki URL, so wiki_url_w is omitted here.
        manual_update_item = {k: v for k, v in item.items() if k != 'wiki_url_w'}
    logging.debug('Mapped Wikipedia fields: %s', item)
    company = session.query(Company).filter_by(name=company_name,
                                               website=company_website)
    if not company.count():
        company = session.query(Company).filter_by(name=company_name)
    company = company.first()
    wiki_company = session.query(WikipediaDb).filter(
        WikipediaDb.company_name_w == company_name)
    new_entry = WikipediaDb(company_name_w=company_name,
                            timestamp_w=func.now(),
                            wc_id=company.id,
                            **item)
    if update and wiki_company.count() and (
            not company.is_wiki_manualy_u or spider.is_manual_update_wiki):
        if wiki_company[0].manual_entry in ('Yes', 'manual', 'confirmed'):
            wiki_company.update(manual_update_item, synchronize_session=False)
        else:
            wiki_company.update(item, synchronize_session=False)
    elif not wiki_company.count():
        session.add(new_entry)
    if not company.is_wiki_manualy_u or spider.is_manual_update_wiki:
        company.is_wiki_manualy_u = True
    company.last_update = func.now()
    company.wiki_evaluation = func.now()
    company.wikipedia_url = item['wiki_url_w']
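# A minimal settings sketch, assuming the two process_item pipelines above live in a
# pipelines module of this project; the dotted paths and class names
# (XingCompanyPipeline, WikipediaCompanyPipeline) are hypothetical. ITEM_PIPELINES is
# standard Scrapy: each enabled pipeline's process_item runs in ascending priority order.
ITEM_PIPELINES = {
    'mx_crm.pipelines.XingCompanyPipeline': 300,        # hypothetical path for the Xing pipeline above
    'mx_crm.pipelines.WikipediaCompanyPipeline': 400,   # hypothetical path for the Wikipedia pipeline above
}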
def create_report(companies, account_data=[], account_headers=[], total_fields=[],
                  data_links={}, google_analytics_companies={}, dates={}):
    """
    Creates the report and saves it locally.

    :param companies: mapping of company name to the company entry (sessions, address,
        session length) for companies that made requests during the specified range
    """
    logger.debug(companies)
    file_name = settings.REPORTS_FILE.format(
        now=datetime.datetime.now().strftime("%Y_%m_%d-%H_%M_%S"))
    path_to_xl = settings.rel('mx_crm', settings.REPORTS_FOLDER, file_name)
    logger.debug('Export excel file: {}'.format(path_to_xl))
    wb = Workbook()
    ws = wb.create_sheet('Report')
    logger.info('Saving report to the local excel file')
    # copy so the shared settings list is not mutated in place by the += below
    wb_headers = list(settings.NEW_WORKBOOK_HEADERS)
    # wb_headers = settings.WORKBOOK_HEADERS
    if account_headers:
        wb_headers += account_headers
    if total_fields:
        wb_headers += settings.TOTAL_HEADERS
    wb_headers += settings.RATING_HEADERS
    ws.append(wb_headers)
    companies_info = get_companies_info(companies)
    logger.info('companies_info: %s', companies_info)
    companies_info_manual_id = get_company_table_info(companies)
    logger.info('companies_info_manual_id: %s', companies_info_manual_id)
    # manual
    companies_info_websites = get_companies_info_websites(companies)
    logger.debug('Companies: {}'.format(len(companies_info)))
    companies_wiki_info = get_wiki_info(companies)
    logger.debug('Wiki companies: {}'.format(len(companies_wiki_info)))
    companies_xing_info = get_xing_info(companies)
    logger.debug('Xing companies: {}'.format(len(companies_xing_info)))
    companies_names = set()
    websites_for_rating = set()
    for c in companies_info.values():
        if c.website:
            websites_for_rating.add(c.website)
        if c.name:
            companies_names.add(c.name)
    rating_data = SquirrelRating().calc(companies=companies_names,
                                        websites=websites_for_rating)
    company_manual_account = get_manual_account(companies_names)
    variables_data = SquirrelRating().get_rating_variables(companies, websites_for_rating)
    # logger.info("rating data {}".format(rating_data))
    # logger.info("rating data {}".format(type(rating_data)))
    try:
        counter = 0
        for company_name, company in sorted(companies.items(),
                                            key=lambda x: x[1].session_length,
                                            reverse=True):
            ws.row_dimensions[counter].collapsed = True
            address = company.full_address
            country = company.country
            # rating = rating_data.get(company.company_name).get('score')
            wiki_info = companies_wiki_info.get(company_name)
            xing_info = companies_xing_info.get(company_name)
            company_info = companies_info.get(company_name)
            company_table_manual_id = companies_info_manual_id.get(company_name)
            website = company_info.website if company_info else ''
            full_website = re.sub(r'www\d?\.', '', website).rstrip('/').lower()
            prepared_company_name = company_name
            xing_page = company_info.xing_page if company_info else None
            session_length = company.session_length
            for session in company.sessions:
                for request in session.requests:
                    # master_company = alchemy_session.query(Company.name).filter(Company.name == company.company_name)
                    access_history = MxCrmAccessHistory(
                        company_name=company.company_name,
                        a_h_sid=counter,
                        mx_crm_visited_page=request.title,
                        mx_crm_referrer=request.url[:255],
                        mx_crm_session_date=datetime.datetime.fromtimestamp(
                            int(request.timestamp)).strftime('%Y-%m-%d'),
                        mx_crm_session_time=datetime.datetime.fromtimestamp(
                            int(request.timestamp)).strftime('%H:%M:%S'),
                        mx_crm_ip_vlan=request.hostname)
                    alchemy_session.add(access_history)
                    alchemy_session.commit()
                    sheet_counter = 2
                    company_table_info = get_manual_website(company.company_name)
                    access_dt = datetime.datetime.fromtimestamp(
                        request.timestamp).strftime('%Y-%m-%d %H:%M:%S')
                    rcd_name_rating = companies_info.get(company_name)
                    if rcd_name_rating and rcd_name_rating.name:
                        rating = rating_data.get(rcd_name_rating.name, 'N/C') if rating_data.get(
                            rcd_name_rating.name) is not None else 'N/C'
                    total_session_lenght = ''
                    if company_name in total_fields:
                        obj = total_fields.get(company_name, {})
                        total_session_lenght = datetime.timedelta(seconds=obj.get('time') or 0)
                    # row = [company.company_name]
                    sheet_number = 'A{}'.format(sheet_counter)
                    # ws[sheet_number].hyperlink = "http://google.com"
                    # ws[sheet_number].value = company.company_name
                    # ws.cell(row=1, column=sheet_counter).value = '=HYPERLINK("{}", "{}")'.format('google.com', company.company_name)
                    # pprint(company.company_name)
                    link = data_links.get(company.company_name.lower())
                    c_id = alchemy_session.query(Company.id).filter(Company.name == company.company_name)
                    try:
                        company_id = c_id[0][0]
                        webinterface_link = "http://192.168.0.141:8000/squirrel/accounts/{}/".format(company_id)
                    except IndexError:
                        company_id = ''
                        webinterface_link = "http://192.168.0.141:8000/squirrel/accounts/search/{}/".format(
                            company.company_name)
                    # pprint(link)
                    query_link = alchemy_session.query(Company).filter(Company.name == company.company_name)
                    query_link.update({Company.d_crm_link: link}, synchronize_session="fetch")
                    alchemy_session.commit()
                    row = ['=HYPERLINK("{}", "{}")'.format(webinterface_link, company.company_name),
                           company_table_info.get(company.company_name),
                           website,
                           session_length,
                           total_session_lenght,
                           rating_data.get(company.company_name),
                           address,
                           request.title,
                           request.url,
                           access_dt,
                           country]
                    sheet_counter += 1
                    # pprint(type(row))
                    if wiki_info:
                        row.extend([
                            wiki_info.manual_entry,
                            wiki_info.wiki_url_w,
                            convert_to_float(wiki_info.revenue_wikipedia_w),
                            wiki_info.revenue_currency_wiki_w,
                            convert_to_int(wiki_info.employees_wikipedia_w),
                            wiki_info.categories_wikipedia_w,
                            wiki_info.branch_wikipedia_w,
                            wiki_info.summary_wikipedia_w,
                        ])
                    else:
                        row.extend([''] * 8)
                    if xing_info:
                        if company_table_manual_id.manual_account_id:
                            c_t_manual_id = company_table_manual_id.manual_account_id
                        else:
                            c_t_manual_id = u'NONE'
                        row.extend([
                            xing_info.manual_entry,
                            xing_page,
                            xing_info.country_xing,
                            xing_info.employees_group_xing_x,
                            xing_info.employees_size_xing,
                            xing_info.description_xing,
                            xing_info.industry_xing,
                            c_t_manual_id
                            # company_manual_account.get(company_name)
                        ])
                    else:
                        row.extend([''] * 8)
                    if full_website in account_data or prepared_company_name in account_data:
                        data_to_extend = []
                        for key in account_headers:
                            if full_website in account_data:
                                value = account_data[full_website].get(key, '')
                            else:
                                value = account_data[prepared_company_name].get(key, '')
                            data_to_extend.append(value)
                        row.extend(data_to_extend)
                    elif account_headers:
                        row.extend([''] * len(account_headers))
                    if company_name in total_fields:
                        obj = total_fields.get(company_name, {})
                        row.extend([
                            datetime.timedelta(seconds=obj.get('time') or 0),
                            convert_to_int(obj.get('visited')),
                            obj.get('last_visited'),
                        ])
                    else:
                        row.extend([''] * len(settings.TOTAL_HEADERS))
                    rcd_name = companies_info.get(company_name)
                    if rcd_name and rcd_name.name:
                        if wiki_info:
                            row.extend([wiki_info.manual_entry])
                        else:
                            row.extend([""])
                        if xing_info:
                            row.extend([xing_info.manual_entry])
                        else:
                            row.extend([""])
                        query = alchemy_session.query(Company).filter(Company.name == rcd_name.name)
                        rating_update_info = dict(
                            mx_crm_location_level=variables_data.get(rcd_name.name).get('location'),
                            mx_crm_branch_level=variables_data.get(rcd_name.name).get('branch'),
                            mx_crm_google_evaluation=float(variables_data.get(rcd_name.name).get('google_ev')),
                            mx_crm_wiki_rating_points=variables_data.get(rcd_name.name).get('wiki_size'),
                            mx_crm_xing_rating_points=variables_data.get(rcd_name.name).get('xing_size'),
                            mx_crm_revenue_level=variables_data.get(rcd_name.name).get('revenue_point'))
                        query.update(rating_update_info, synchronize_session=False)
                        relation_ship_type = row[36]
                        account_name = row[27]
                        account_owner = row[28]
                        abc_rating = row[38]
                        closed_activity_type = row[31]
                        if row[32] != '':
                            closed_date = row[32]
                        else:
                            closed_date = None
                        # closed_date = datetime.datetime.strptime(str(row[32]), '%m/%d/%Y %H:%M:%S')
                        open_activity_type = row[33]
                        if row[34] != '':
                            schedule_date = row[34]
                        else:
                            schedule_date = None
                        # schedule_date = datetime.datetime.strptime(str(row[34]), '%m/%d/%Y %H:%M:%S')
                        total_session_length = row[39]
                        total_visited_page = row[40]
                        last_visit_time = row[41]
                        alchemy_session.commit()
                        dynamics_crm_info = dict(
                            d_crm_relationship_type=relation_ship_type,
                            d_crm_account_name=account_name,
                            d_crm_account_owner=account_owner,
                            d_crm_abc_rating=abc_rating,
                            d_crm_closed_activity_type=closed_activity_type,
                            d_crm_open_activity_type=open_activity_type,
                            d_crm_closed_date=closed_date,
                            d_crm_schedule_date=schedule_date,
                            mx_crm_total_session_length=total_session_length,
                            mx_crm_total_visited_pages=total_visited_page,
                            mx_crm_last_visit=last_visit_time,
                            squirrel_rating=rating_data.get(rcd_name.name))
                        # webinterface_link=webinterface_link)
                        # also in this query save webinterface link
                        query_dynamics_crm = alchemy_session.query(Company).filter(Company.name == rcd_name.name)
                        query_dynamics_crm.update(dynamics_crm_info, synchronize_session=False)
                        alchemy_session.commit()
                        row.extend([
                            rating_data.get(rcd_name.name, 'N/C') if rating_data.get(
                                rcd_name.name) is not None else 'N/C',
                            variables_data.get(rcd_name.name).get('location'),
                            variables_data.get(rcd_name.name).get('branch'),
                            variables_data.get(rcd_name.name).get('google_ev'),
                            variables_data.get(rcd_name.name).get('wiki_size'),
                            variables_data.get(rcd_name.name).get('xing_size'),
                            variables_data.get(rcd_name.name).get('revenue_point'),
                        ])
                    else:
                        row.extend(['N/C'] * len(settings.RATING_HEADERS))
                    try:
                        ws.append(row)
                    except ValueError as e:
                        logger.info(e)
                    counter += 1
                    if not ws.row_dimensions[counter - 1].collapsed:
                        ws.row_dimensions[counter].hidden = True
                        ws.row_dimensions[counter].outlineLevel = 1
        wb.save(path_to_xl)
        d_start = dates.get('start_date')
        e_date = dates.get('end_date')
        start_date = datetime.datetime(d_start.year, d_start.month, d_start.day)
        end_date = datetime.datetime(e_date.year, e_date.month, e_date.day)
        # g_a_c = get_google_analytics_sessions(start_date, end_date, True)
        # logger.info(g_a_c)
        # logger.info(google_analytics_companies)
        # result = add_google_analytics_accounts_to_report_file(path_to_xl, start_date, end_date)
        os.chdir("C:/Users/admin/PycharmProjects/SquirrelRunnerNew/mx_crm")
        # cd = os.system('python add_companies.py --days_start={0} --year_start={1} --month_start={2} --days_end={3} --year_end={4} --month_end={5}'.format(
        #     d_start.day, d_start.year, d_start.month, e_date.day, e_date.year, e_date.month
        # ))
        # logger.info(cd)
    except KeyError as e:
        logger.error(e)
    logger.info('Local file has been updated')
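# Usage sketch (hedged): create_report is presumably fed the mapping produced by
# get_drupal_sessions below, assuming both functions are importable together; the weekly
# range, the helper name build_weekly_report and leaving every other argument at its
# default are illustrative assumptions, not confirmed wiring. Note that when total_fields
# is supplied it is used as a mapping of company name to totals ('in' checks plus .get),
# despite the list default in the signature.
def build_weekly_report():
    end = datetime.datetime.now()
    start = end - datetime.timedelta(days=7)
    companies = get_drupal_sessions(start, end)
    create_report(companies,
                  dates={'start_date': start.date(), 'end_date': end.date()})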
def get_drupal_sessions(start_time, end_time):
    """
    Extracts request sessions from accesslog table.

    :param start_time: time to extract requests from
    :param end_time: time to extract requests to
    :return: Dictionary with sessions info separated by companies.
    """
    logger.info("Started sessions extraction")
    timestamp_start_time = (start_time - datetime(1970, 1, 1)).total_seconds()
    timestamp_end_time = (end_time - datetime(1970, 1, 1)).total_seconds()
    readable_s = datetime.fromtimestamp(timestamp_start_time)
    readable_e = datetime.fromtimestamp(timestamp_end_time)
    access_hosts = session.query(
        Accesslog.timestamp, Accesslog.hostname, Accesslog.path,
        Accesslog.url, Accesslog.title
    ).filter(
        # between(Accesslog.timestamp, timestamp_start_time, timestamp_end_time),
        between(Accesslog.timestamp, func.unix_timestamp(start_time), func.unix_timestamp(end_time)),
        Accesslog.title != 'Generate image style',
        Accesslog.hostname.notin_(settings.IPS_BLACKLIST)).order_by(
            Accesslog.hostname, Accesslog.timestamp)
    accesslog = [Access(*res) for res in access_hosts]
    blacklist = {
        tup[0].lower()
        for tup in session.query(Company.name).filter(
            Company.type_main.in_(['Blacklist', 'Spam', 'Provider']))
    }
    ips_info = {
        tup[0]: tup[1:]
        for tup in session.query(DbIpDatabase.ip_ip, DbIpDatabase.ip_country,
                                 DbIpDatabase.ip_name, DbIpDatabase.ip_name_2,
                                 DbIpDatabase.ip_address)
    }
    res = {}
    drupal_session = DrupalSession()
    session_length = 0
    len_accesslog = len(accesslog[:-1]) - 1
    for index, request in enumerate(accesslog[:-1]):
        host = ip_digits(request.hostname)
        access_datetime = datetime.fromtimestamp(int(request.timestamp))
        next_request = accesslog[index + 1]
        next_request_host = ip_digits(next_request.hostname)
        next_request_access_datetime = datetime.fromtimestamp(int(next_request.timestamp))
        difference = next_request_access_datetime - access_datetime
        is_continue = False
        if host == next_request_host and difference.seconds < settings.MAXIMUM_DIFFERENCE_BETWEEN_SESSIONS.seconds:
            session_length += difference.seconds
            is_continue = True
        elif host == next_request_host:
            session_length += settings.LONG_SESSION_DEFAULT
            is_continue = True
        elif host != next_request_host:
            session_length += settings.LONG_SESSION_DEFAULT
        if index and host == ip_digits(accesslog[index - 1].hostname) and host != next_request_host:
            drupal_session.append(request)
        elif host == next_request_host:
            drupal_session.append(request)
            is_continue = True
        if is_continue and index != len_accesslog:
            continue
        if host in ips_info:
            country, company_name, address_result, full_address_result = ips_info[host]
        else:
            country = company_name = address_result = full_address_result = ''
            try:
                country, company_name, address_result, full_address_result = get_whois(host)
            except Exception as e:
                logger.error(
                    'get_whois function (RIPE) got an error for host: {}\nError: {}'.format(host, str(e)))
                continue
            finally:
                address_result = address_result[:250]
                logger.debug(address_result)
                full_address_result = full_address_result[:350]
                new_entry = DbIpDatabase(ip_ip=host,
                                         ip_country=country,
                                         ip_name=company_name,
                                         ip_name_2=address_result,
                                         ip_address=full_address_result,
                                         ip_host=host,
                                         ip_timestamp=func.now())
                session.add(new_entry)
                ips_info[host] = (country, company_name, address_result, full_address_result)
        company_name = company_name.lower()
        if company_name and country in settings.RELEVANT_COUNTRIES \
                and company_name not in settings.PROVIDERS_BLACKLIST \
                and company_name not in blacklist \
                and not any(word in company_name for word in settings.COMPANIES_BLACKLIST) \
                and not any(re.search(regexp, company_name) for regexp in settings.PROVIDERS_BLACKLIST_REGEXPS) \
                and not any(re.search(regexp, company_name) for regexp in settings.COMPANIES_BLACKLIST_REGEXPS):
            if company_name not in res:
                res[company_name] = CompanyEntry(*ips_info[host], sessions=[])
            res[company_name].sessions.append(drupal_session)
            res[company_name].session_length = timedelta(seconds=session_length)
        drupal_session = DrupalSession()
        session_length = 0
    session.commit()
    logger.info('Sessions extraction has been finished successfully.')
    return res
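# A minimal, self-contained sketch of the per-host length bookkeeping in the loop above:
# the gap to the next request counts in full while it stays below the inactivity
# threshold, otherwise a fixed default is added, and the run's closing request also adds
# that default. The helper name and the numeric defaults are illustrative only, not the
# project's settings values.
def estimate_session_length(timestamps, threshold=1800, long_session_default=30):
    """Estimate a session length from a sorted list of unix timestamps of one host."""
    if not timestamps:
        return 0
    length = 0
    for prev, nxt in zip(timestamps, timestamps[1:]):
        gap = nxt - prev
        length += gap if gap < threshold else long_session_default
    return length + long_session_default  # the closing request contributes the default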