def xing_update(self, xing_names_urls):
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    xing_login = '******'
    xing_password = '******'
    for name, url in xing_names_urls.iteritems():
        # NOTE: the old check `url != u'' or u'N/A'` was always true because
        # the non-empty literal u'N/A' is truthy on its own.
        if url not in (u'', u'N/A'):
            #scrapyd_data.update(spider=XING_MANUAL_NAME, companies=name, urls=url,
            #                    login=xing_login, password=xing_password)
            #requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
            query = session.query(XingCompanyDb).filter(
                XingCompanyDb.company_name_x == name,
            )
            query.update({XingCompanyDb.manual_entry: "Yes"},
                         synchronize_session="fetch")
            session.commit()
        else:
            # No usable URL was passed in, so fall back to the xing_page
            # stored on the Company record.
            query_x_p = session.query(
                Company.xing_page).filter(Company.name == name)
            xing_page = query_x_p[0][0]
            query = session.query(XingCompanyDb).filter(
                XingCompanyDb.company_name_x == name,
            )
            query.update({XingCompanyDb.manual_entry: "Yes",
                          XingCompanyDb.xing_url: xing_page},
                         synchronize_session="fetch")
            session.commit()
            logger.info("Fell back to Company.xing_page for %s", name)
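# Illustrative-only sketch of why the URL guard above was rewritten. In
# Python, `url != u'' or u'N/A'` parses as `(url != u'') or u'N/A'`, and the
# literal u'N/A' is always truthy, so the branch was taken for every URL.
# The tuple membership test is the intended check. Runnable standalone.
def _has_usable_url(url):
    return url not in (u'', u'N/A')

assert _has_usable_url(u'https://www.xing.com/companies/example')
assert not _has_usable_url(u'')
assert not _has_usable_url(u'N/A')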
def update_squirrel_rating(self, companies_names=None):
    # Avoid a mutable default argument.
    companies_names = companies_names or []
    websites = []
    for name in companies_names:
        query = session.query(Company.website).filter(Company.name == name)
        websites.append(query[0][0])
    rating_parts = SquirrelRating().calc(companies_names, websites, True)
    # Iterate rating_parts directly instead of building a separate names list.
    for name, parts in rating_parts.items():
        rating_update_info = dict(
            mx_crm_location_level=parts.get('location'),
            mx_crm_branch_level=parts.get('branch'),
            mx_crm_google_evaluation=parts.get('google_ev'),
            mx_crm_wiki_rating_points=parts.get('wiki_size'),
            mx_crm_xing_rating_points=parts.get('xing_size'),
            mx_crm_revenue_level=parts.get('revenue_point'),
            squirrel_rating=parts.get('score'))
        query = session.query(Company).filter(Company.name == name)
        query.update(rating_update_info, synchronize_session=False)
        session.commit()
def update_squirrel_rating():
    # One-off fixer: copy Company.xing_page into XingCompanyDb.xing_url for
    # entries whose xing_url is missing.
    companies_names_xing = ['thuega meteringservice gmbh']
    for name in companies_names_xing:
        pprint(name)
        query_x_url = session.query(XingCompanyDb.company_name_x,
                                    XingCompanyDb.xing_url).filter(
            XingCompanyDb.company_name_x == name,
        )
        try:
            xing_url = query_x_url[0][1]
        except IndexError:
            xing_url = u''
        pprint(xing_url)
        if xing_url != u'':
            # Already has a URL; nothing to do.
            continue
        query_x_p = session.query(
            Company.xing_page).filter(Company.name == name)
        xing_page = query_x_p[0][0]
        pprint(xing_page)
        query = session.query(XingCompanyDb).filter(
            XingCompanyDb.company_name_x == name,
        )
        query.update({XingCompanyDb.xing_url: xing_page},
                     synchronize_session="fetch")
        session.commit()
def get_companies_for_wikipedia(companies, force_update):
    """
    Prepares a list of company names for searching on Wikipedia.

    :param companies: companies that made requests during the specified range
    :param force_update: force-update company info in the database from spiders
    """
    # NOTE: `Company.website is not None` is evaluated by Python, not by
    # SQLAlchemy, and is always True; .isnot(None) emits the intended SQL.
    pairs = session.query(Company.name, Company.website).filter(
        Company.name.in_(companies) & Company.website.isnot(None)
        & (Company.website != 'NA'))
    pairs = {pair[0].lower(): pair[1] for pair in pairs}
    existing_entries = session.query(WikipediaDb.company_name_w).filter(
        WikipediaDb.company_name_w.in_(companies))
    existing_names = {name[0].lower() for name in existing_entries}
    res = {}
    for company in companies:
        url = pairs.get(company)
        if not url:
            continue
        elif force_update or company in existing_names:
            company = u'update_{}'.format(company)
        res[company] = url
    return res
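# Illustrative-only sketch of the naming convention above: companies that
# already have a WikipediaDb row (or any company when force_update is set)
# get an 'update_' prefix so the spider refreshes instead of creating.
# Stubbed dicts stand in for the real query results; runnable standalone.
def _demo_wikipedia_targets(companies, pairs, existing_names, force_update):
    res = {}
    for company in companies:
        url = pairs.get(company)
        if not url:
            continue
        if force_update or company in existing_names:
            company = u'update_{}'.format(company)
        res[company] = url
    return res

assert _demo_wikipedia_targets(
    ['acme gmbh', 'new co'],
    {'acme gmbh': 'acme.de', 'new co': 'new.co'},
    {'acme gmbh'},
    False) == {u'update_acme gmbh': 'acme.de', 'new co': 'new.co'}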
def get_companies_for_xing(companies, force_update):
    """
    Prepares a list of company names for searching on Xing.

    :param companies: companies that made requests during the specified range
    :param force_update: force-update company info in the database from spiders
    """
    existing_entries = session.query(XingCompanyDb).join(
        Company, Company.id == XingCompanyDb.xc_id).filter(
            Company.name.in_(companies),
            Company.xing_page != 'NA',
            # .isnot(None) emits SQL; a bare `is not None` is evaluated by
            # Python and is always True.
            Company.xing_page.isnot(None),
        )
    existing_objects_by_name = set(
        session.query(XingCompanyDb).filter(
            XingCompanyDb.company_name_x.in_(companies)))
    # Xing rows whose Company no longer qualifies are stale; delete them.
    to_delete_ids = {
        c.x_id
        for c in existing_objects_by_name - set(existing_entries)
    }
    if to_delete_ids:
        session.query(XingCompanyDb).filter(
            XingCompanyDb.x_id.in_(to_delete_ids)).delete(
                synchronize_session='fetch')
        session.commit()
    existing_names = {
        entry.company_name_x.lower()
        for entry in existing_entries
    }
    res = set(companies) - existing_names
    if force_update:
        res.update({u'update_' + name for name in existing_names})
    return res
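# Illustrative-only sketch of the stale-row cleanup above, using plain sets
# of ids instead of ORM objects. Rows found by name but not by the qualifying
# join are considered stale and deleted. Runnable standalone.
qualifying_ids = {1, 2}          # rows whose Company still has a xing_page
rows_by_name_ids = {1, 2, 3}     # all rows matching the requested names
stale_ids = rows_by_name_ids - qualifying_ids
assert stale_ids == {3}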
def _process_google_item(self, item, spider):
    # Building a query never raises IntegrityError (nothing is flushed), so
    # the old try/except around the filter() call was dead code.
    q = session.query(Company).filter(
        Company.name == item['company_name'])
    logging.info("Processing google item: %s", item)
    if q.count() and item['update']:
        c = q.first()
        website = 'NA'
        if c.website:
            website = c.website
        elif c.website_long:
            website = urlparse.urlsplit(c.website_long)[1]
        if c.manual_entry == 'Yes':
            q.update({
                'website': item['url'],
                'website_long': item['url_long'],
                'website_updated': datetime.now(),
                'website_old': website,
                'last_update': datetime.now(),
                'manual_entry': 'manual',
            })
            logging.info("Manual entry updated for %s", item['company_name'])
        elif c.manual_entry == 'old':
            q.update({
                'website': item['url'],
                'website_long': item['url_long'],
                'website_updated': datetime.now(),
                'website_old': website,
                'last_update': datetime.now(),
                'manual_entry': 'No'
            })
        else:
            update_item = {
                'website': item['url'],
                'website_long': item['url_long'],
                'website_updated': datetime.now(),
                'website_old': website,
                'last_update': datetime.now()
            }
            logging.info(update_item)
            q.update(update_item)
    elif not q.count():
        new_company = Company(name=item['company_name'],
                              website=item['url'],
                              website_long=item['url_long'])
        session.add(new_company)
    # Commit once at the end so every branch (including the new-company
    # insert) is persisted; previously only the 'old' branch committed.
    session.commit()
def _log_update(self, log): calc_log = session.query(CalculationsTime).first() if not calc_log: calc_log = CalculationsTime(**log) session.add(calc_log) else: session.query(CalculationsTime).update(log) session.commit()
def total_fields(self, company_names, already_calculated=False):
    """
    Calculate total session length and total visit count per company.
    """
    if not self.timestamps:
        self.load_last_timestamps()
    if company_names and not already_calculated:
        hosts = [
            re.sub(r'\d+$', '0', tup[0])
            for tup in session.query(DbIpDatabase.ip_ip).filter(
                DbIpDatabase.ip_name.in_(company_names))
        ]
        RSV = RecalculateSessionVisited(hosts=hosts)
        RSV.call_recalculate_per()
    data = {
        tup[0]: tup[1:]
        for tup in session.query(
            DbIpDatabase.ip_ip, DbIpDatabase.ip_name,
            DbIpDatabase.ip_country, DbIpDatabase.ip_address,
            DbIpDatabase.total_session_length,
            DbIpDatabase.total_visit_count,
            DbIpDatabase.last_total_update).filter(
                DbIpDatabase.ip_name.in_(company_names))
    }
    group_by_company_name = {}
    for key, value in data.items():
        timestamp = self.timestamps.get(key)
        if value[0].lower() in group_by_company_name:
            current_obj = group_by_company_name.get(value[0].lower())
            lv = current_obj.get('last_visited')
            # Keep the most recent visit; fall back to the epoch when the
            # stored value is empty. Inside this branch `timestamp` is
            # truthy, so the old trailing `if timestamp else ''` was dead.
            if timestamp and (lv or datetime.datetime.fromtimestamp(
                    0)) < datetime.datetime.fromtimestamp(timestamp):
                current_obj['last_visited'] = \
                    datetime.datetime.fromtimestamp(timestamp)
            current_obj['time'] += value[3]
            current_obj['visited'] += value[4]
        else:
            group_by_company_name[value[0].lower()] = {
                'country': value[1],
                'full_address': value[2],
                'time': value[3],
                'visited': value[4],
                'last_visited': datetime.datetime.fromtimestamp(timestamp)
                if timestamp else ''
            }
    return group_by_company_name
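# Illustrative-only sketch of the aggregation above with plain tuples instead
# of DbIpDatabase rows: per-IP (name, country, address, time, visits) records
# collapse into one entry per lowercased company name. Runnable standalone.
rows = {
    '1.2.3.0': ('Acme', 'DE', 'Berlin', 120, 3),
    '4.5.6.0': ('acme', 'DE', 'Berlin', 30, 1),
}
grouped = {}
for ip, (name, country, address, time_spent, visits) in rows.items():
    key = name.lower()
    if key in grouped:
        grouped[key]['time'] += time_spent
        grouped[key]['visited'] += visits
    else:
        grouped[key] = {'country': country, 'full_address': address,
                        'time': time_spent, 'visited': visits}
assert grouped['acme']['time'] == 150 and grouped['acme']['visited'] == 4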
def _accesslogs_query(self, **kwargs):
    if self.hosts:
        # Dedupe the LIKE patterns (strings) rather than the clause objects:
        # SQLAlchemy clauses hash by identity, so a set of clauses does not
        # collapse duplicates.
        patterns = {re.sub(r'\d+$', '%', host) for host in self.hosts}
        return session.query(Accesslog).filter(
            or_(*[Accesslog.hostname.like(p) for p in patterns])
        ).order_by(Accesslog.hostname, Accesslog.timestamp).yield_per(1000)
    elif kwargs.get('timestamp'):
        return session.query(Accesslog).filter(
            Accesslog.timestamp >= kwargs.get('timestamp')).order_by(
                Accesslog.hostname, Accesslog.timestamp).yield_per(1000)
    return session.query(Accesslog).order_by(
        Accesslog.hostname, Accesslog.timestamp).yield_per(1000)
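# Illustrative-only: how a host like '192.168.1.42' becomes a SQL LIKE
# pattern matching the whole last-octet block. Runnable standalone.
import re
assert re.sub(r'\d+$', '%', '192.168.1.42') == '192.168.1.%'
assert re.sub(r'\d+$', '%', '10.0.0.7') == '10.0.0.%'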
def read_account_file_with_id(self, company_names):
    data = {}
    ids = []
    rows = self.ws.rows
    columns_names = self._get_account_headers(rows)
    for name in company_names:
        query = session.query(
            Company.name, Company.account_id).filter(Company.name == name)
        try:
            ids.append(query[0][1])
        except IndexError:
            continue
    for row_i, row in enumerate(rows):
        # The first two rows hold headers, not data; skipping empty rows
        # here also avoids a KeyError on obj['Account'] below.
        if row_i < 2 or not row:
            continue
        obj = {}
        for cell_i, cell in enumerate(row):
            obj[columns_names[cell_i]] = cell.value
        company_name = obj['Account'] or ''
        company_name = prepare_company_name_for_match(company_name)
        p_account_id = obj['Account ID'] or ''
        # Prefer matching by account id; fall back to the normalized name.
        if p_account_id and p_account_id in ids:
            data[p_account_id] = obj
        elif company_name in company_names:
            data[company_name] = obj
    return data
def fixing_wrong_old_wiki(name):
    query = session.query(WikipediaDb).filter(
        WikipediaDb.company_name_w == name,
    )
    # One update call instead of two round trips.
    query.update({WikipediaDb.manual_entry: "No",
                  WikipediaDb.last_update_w: func.now()},
                 synchronize_session="fetch")
    session.commit()
def _get_companies(self, entry_companies_keys): return alchemy_session.query(Company). \ filter( Company.name.in_(entry_companies_keys) & (Company.website != None) & (Company.website != 'NA') ).all()
def process_item(self, item, spider):
    logging.info("Processing wikipedia item: %s", item)
    company_name = item['company_name']
    headquarters = item.get('sitz', '')[:50] if item.get('sitz') else None
    # Column limit is 130 chars; fall back to scheme://host for long URLs.
    if item.get('wiki_company_website') and len(
            item['wiki_company_website']) > 130:
        parsed_url = urlparse.urlparse(item['wiki_company_website'])
        item['wiki_company_website'] = '{protocol}://{hostname}'.format(
            protocol=parsed_url.scheme, hostname=parsed_url.hostname)
    item = dict(summary_wikipedia_w=item['summary'],
                categories_wikipedia_w=item['categories'],
                revenue_wikipedia_w=item.get('revenue', ''),
                revenue_currency_wiki_w=item.get('currency', ''),
                branch_wikipedia_w=item.get('branche', ''),
                wiki_url_w=item['company_website'],
                headquarters_wiki_w=headquarters,
                employees_wikipedia_w=item.get('mitarbeiter', ''),
                company_website_w=item.get('wiki_company_website', ''),
                last_update_w=func.now())
    query = session.query(WikipediaDb).filter(
        WikipediaDb.company_name_w == company_name,
    )
    query.update(item, synchronize_session=False)
    # 'old' entries go back to 'No'; anything else is marked 'manual'.
    if query[0].manual_entry == "old":
        query.update({WikipediaDb.manual_entry: "No"},
                     synchronize_session="fetch")
    else:
        query.update({WikipediaDb.manual_entry: "manual"},
                     synchronize_session="fetch")
    session.commit()
def get_google_analytics_sessions(start_time, end_time, google=False):
    # NOTE: `google` is kept for backward compatibility but is unused since
    # the per-company lookup branch was removed.
    total_session = session.query(
        GoogleAnalyticsVisits.company_name_g).filter(
            start_time <= GoogleAnalyticsVisits.visit_date).filter(
                GoogleAnalyticsVisits.visit_date <= end_time)
    return [row[0] for row in total_session]
def mass_evaluation(self):
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    query = session.query(Company.name).filter(
        Company.manual_entry == "Yes",
    )
    # Materialize the names BEFORE the update: the query re-executes on
    # iteration, and after the update no row matches manual_entry == "Yes".
    companies = [u'update_{}'.format(name[0].lower()) for name in query]
    query.update({Company.manual_entry: "manual"},
                 synchronize_session="fetch")
    session.commit()
    #companies = q.get_companies_for_google_search(companies_names, force_update)
    #companies = SPLITTER.join(companies)
    logger.debug(companies)
    scrapyd_data.update(spider=GOOGLE_NAME, companies=companies)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    while True:
        resp = get_scrapyd_jobs(project_name)
        if len(resp['pending']) or len(resp['running']):
            logger.debug('{} spider still working'.format("google"))
            time.sleep(5)
        else:
            time.sleep(10)
            break
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.sync_all()
def main():
    """
    Looks up the maximum timestamp in the squirrel database and imports the
    drupal accesslog data recorded since then. Make sure you are not
    connected via the T-Mobile stick, or the database connection to drupal
    will fail.
    """
    logger.info("Start synchronizing accesslogs.")
    start_time = time.time()
    logger.info("Get max current timestamp.")
    local_accesslog = session.query(func.max(Accesslog.timestamp)).first()
    local_accesslog = local_accesslog[0] if local_accesslog else None
    if not local_accesslog:
        return
    logger.info("Get all new accesslogs.")
    drupal_accesslogs = drupal_session.query(Accesslog).filter(
        Accesslog.timestamp > local_accesslog)
    logger.info("Build bulk insert query.")
    session.bulk_insert_mappings(Accesslog, [
        dict(aid=i.aid,
             sid=i.sid,
             title=i.title,
             path=i.path,
             url=i.url,
             hostname=i.hostname,
             uid=i.uid,
             timer=i.timer,
             timestamp=i.timestamp) for i in drupal_accesslogs
    ])
    session.commit()
    logger.info("Data loaded in %s seconds. Count: %s" %
                (str(time.time() - start_time), drupal_accesslogs.count()))
def fixing_wrong_old(name):
    query = session.query(XingCompanyDb).filter(
        XingCompanyDb.company_name_x == name,
    )
    # One update call instead of two round trips.
    query.update({XingCompanyDb.manual_entry: "No",
                  XingCompanyDb.last_update_x: func.now()},
                 synchronize_session="fetch")
    session.commit()
def get_zero_website_visits():
    companies = session.query(Company.name).filter(Company.website_visit == 0)
    return [row[0] for row in companies]
def gg():
    query = session.query(
        Company.name).filter(Company.source == 'Excel Import')
    for q in query:
        pprint(q[0])
    pprint(query.count())
def update_db_hosts(): ips = session.query(DbIpDatabase) logger.info( 'Starting update IPs ({}) from 255.255.255.255 to 255.255.255.0'. format(ips.count())) for ip in ips: ip.ip_ip = ip_digits(ip.ip_ip) session.commit()
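# Illustrative-only: ip_digits (defined elsewhere) is assumed to zero the
# last octet, as the log message above suggests; a minimal hypothetical
# equivalent for reference. Runnable standalone.
import re
def _zero_last_octet(ip):
    return re.sub(r'\d+$', '0', ip)

assert _zero_last_octet('255.255.255.255') == '255.255.255.0'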
def _prepare_company_name(self, company):
    company_name = company.replace('update_', '')
    c = session.query(Company.name, Company.impressum_name).filter(
        Company.name == company_name,
        # .isnot(None) emits SQL; `is not None` is evaluated by Python
        # and is always True.
        Company.impressum_name.isnot(None)).first()
    if c:
        company_name = c.impressum_name or company_name
    return company_name
def get_zero_website_visit_xing():
    zero = get_zero_website_visits()
    xing_names = []
    for i in zero:
        match = session.query(XingCompanyDb.company_name_x).filter(
            XingCompanyDb.company_name_x == i).first()
        # Only count names that actually have a xing entry; the old code
        # appended every name regardless of the query result.
        if match:
            xing_names.append(i)
            pprint(i)
    pprint(len(xing_names))
def get_websites_wikipedia(companies=None):
    # Avoid a mutable default argument; catch only the expected IndexError
    # instead of a bare except.
    companies = companies or []
    websites = []
    for i in companies:
        try:
            web = session.query(Company.website).filter(Company.name == i)
            websites.append(web[0][0])
        except IndexError:
            continue
    return websites
def get_manual_wikipedia_companies():
    # NOTE: despite the name, this returns entries flagged "old".
    companies = session.query(
        WikipediaDb.company_name_w).filter(WikipediaDb.manual_entry == "old")
    return [row[0] for row in companies]
def update_old(self):
    get_old_wikipedia_companies()
    time.sleep(10)
    query = session.query(WikipediaDb.company_name_w,
                          WikipediaDb.wiki_url_w).filter(
        WikipediaDb.manual_entry == "old",
    )
    existing_names = []
    existing_urls = []
    for name in query:
        existing_names.append(name[0])
        existing_urls.append(name[1])
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    logger.info(sys.argv)
    dict_names_urls = dict(zip(existing_names, existing_urls))
    for name, url in dict_names_urls.iteritems():
        if url in (u'NA', u'N/A', u'') or url is None:
            # No usable URL stored; reset the flag instead of scheduling.
            logger.info(url)
            logger.info(name)
            fixing_wrong_old_wiki(name)
        else:
            # scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME, companies='BKK Demag Krauss-Maffei', urls='www.bkk-dkm.de')
            scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME,
                                companies=name,
                                urls=url)
            requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    # scrapyd_data.update(spider=WIKIPEDIA_MANUAL_NAME, companies=dict_names_urls.keys(), urls=dict_names_urls.values())
    while True:
        resp = get_scrapyd_jobs(project_name)
        try:
            if len(resp['pending']) or len(resp['running']):
                logger.debug('{} spider still working'.format("wikipedia"))
                time.sleep(5)
                logger.info(resp)
            else:
                time.sleep(10)
                break
        except KeyError:
            if resp['status'] == u'error':
                time.sleep(5)
                logger.info(resp)
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.wiki_sync()
    RatingUpdate().update_squirrel_rating(existing_names)
def mass_update(self, company_name, xing_login, xing_password, new_xing_url):
    xing_url = new_xing_url
    with open("mx_crm/manual_queries/xing_url.txt", "w") as f:
        f.write(xing_url)
    print('*' * 50)
    print('Start updating xing info for company {}'.format(company_name))
    query = session.query(XingCompanyDb).filter(
        XingCompanyDb.company_name_x == company_name,
    )
    query.update({XingCompanyDb.manual_entry: "ololo",
                  XingCompanyDb.xing_url: new_xing_url},
                 synchronize_session="fetch")
    session.commit()
    print('*' * 50)
    project_name = 'default'
    scrapyd_data = {'project': project_name}
    decode_company_name = u'{}'.format(company_name.decode('utf-8'))
    print(decode_company_name)
    # get_companies_for_xing adds the 'update_' prefix itself when
    # force_update is set, so no manual prefixing is needed here.
    companies_names = [decode_company_name.lower()]
    force_update = True
    print('Start parsing given xing url {}'.format(xing_url))
    companies = q.get_companies_for_xing(companies_names, force_update)
    companies = SPLITTER.join(companies)
    scrapyd_data.update(spider=XING_NAME,
                        companies=companies,
                        login=xing_login,
                        password=xing_password)
    requests.post(SCRAPYD_SCHEDULE_URL, scrapyd_data)
    from mx_crm.utils import get_scrapyd_jobs
    while True:
        resp = get_scrapyd_jobs(project_name)
        if not len(resp['finished']):
            time.sleep(3)
        else:
            break
    logger.info('Updating resources...')
    from mx_crm.synchronizers.resource_sync import ResourceSync
    RS = ResourceSync()
    RS.xing_sync()
def get_dup():
    names = session.query(WikipediaDb.company_name_w).all()
    cleaned_names = [row[0] for row in names]
    import collections
    dup_names = [
        item for item, count in collections.Counter(cleaned_names).items()
        if count > 1
    ]
    for name in dup_names:
        query = session.query(
            WikipediaDb.company_name_w,
            WikipediaDb.wi_id).filter(WikipediaDb.company_name_w == name)
        pprint(query[0])
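# Illustrative-only: the Counter-based duplicate detection above, with plain
# strings instead of database rows. Runnable standalone.
import collections
sample = ['acme', 'globex', 'acme', 'initech', 'globex', 'acme']
dups = [n for n, c in collections.Counter(sample).items() if c > 1]
assert sorted(dups) == ['acme', 'globex']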
def x(): start_time = datetime.datetime(2019, 2, 25) end_time = datetime.datetime(2019, 2, 28) total_session = session.query(GoogleAnalyticsVisits.company_name_g).filter( start_time <= GoogleAnalyticsVisits.visit_date).filter( GoogleAnalyticsVisits.visit_date <= end_time) for i in total_session: print(i[0])
def set_wikipedia_manual_entry_manual(companies=None):
    from sqlalchemy.exc import SQLAlchemyError
    # Avoid a mutable default argument; catch only database errors instead
    # of a bare except, and roll back so the session stays usable.
    companies = companies or []
    for i in companies:
        try:
            query = session.query(WikipediaDb).filter(
                WikipediaDb.company_name_w == i,
            )
            query.update({WikipediaDb.manual_entry: "manual"},
                         synchronize_session="fetch")
            session.commit()
        except SQLAlchemyError:
            session.rollback()
            continue
def calc(self, company):
    # Returns the size points for a company based on its Wikipedia employee
    # count; 0 when no usable entry exists.
    query = session.query(WikipediaDb.employees_wikipedia_w).filter(
        WikipediaDb.company_name_w == company,
        # .isnot(None) emits SQL; `is not None` is evaluated by Python
        # and is always True.
        WikipediaDb.employees_wikipedia_w.isnot(None),
        WikipediaDb.employees_wikipedia_w != '')
    try:
        employees = str(query[0])
    except IndexError:
        return 0
    # Extract the first run of digits from the employees string; guard
    # against strings with no digits (the old code would raise here).
    regexed_size = re.search(r'\d+', employees)
    if not regexed_size:
        return 0
    employees_int = int(regexed_size.group(0))
    if employees_int <= 10:
        return 1
    elif employees_int <= 50:
        return 1.1
    elif employees_int <= 200:
        return 1.2
    elif employees_int <= 500:
        return 1.5
    elif employees_int <= 1000:
        return 4
    elif employees_int <= 5000:
        return 8
    elif employees_int <= 10000:
        return 9.5
    return 10
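# Illustrative-only: the size-point thresholds above as a standalone,
# table-driven function with a few spot checks. Runnable standalone.
def _size_points(employees_int):
    thresholds = [(10, 1), (50, 1.1), (200, 1.2), (500, 1.5),
                  (1000, 4), (5000, 8), (10000, 9.5)]
    for limit, points in thresholds:
        if employees_int <= limit:
            return points
    return 10

assert _size_points(10) == 1
assert _size_points(51) == 1.2
assert _size_points(10001) == 10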