def handleAnnotation(self, cell, annotation):
    """
    Create relevant triples for the annotation attached to the cell
    """
    # Create triples according to the Open Annotation model
    annotation_URI = cell['URI'] + "-oa"
    annotation_body_URI = annotation_URI + '-body'
    self.graph.add((annotation_URI, RDF.type, OA.Annotation))
    self.graph.add((annotation_URI, OA.hasTarget, cell['URI']))
    self.graph.add((annotation_URI, OA.hasBody, annotation_body_URI))
    self.graph.add((annotation_body_URI, RDF.type, RDFS.Resource))
    self.graph.add((annotation_body_URI, TABLINKER.value,
                    Literal(clean_string(getText(annotation)))))

    # Extract author
    author = annotation.getElementsByType(dc.Creator)
    if len(author) > 0:
        author = clean_string(str(author[0]))
        self.graph.add((annotation_body_URI, OA.annotatedBy, Literal(author)))

    # Extract date
    creation_date = annotation.getElementsByType(dc.Date)
    if len(creation_date) > 0:
        creation_date = str(creation_date[0])
        self.graph.add((annotation_body_URI, OA.serializedAt,
                        Literal(creation_date, datatype=XSD.date)))
def handleTitle(self, cell):
    """
    Create relevant triples for the cell marked as Title
    """
    self.graph.add((cell['sheetURI'], RDFS.comment,
                    Literal(clean_string(cell['value']))))
def _create_cell(self, cell, cell_type):
    """
    Create a new cell
    """
    # Set the value
    value = Literal(clean_string(cell['value']))

    # It's a cell
    self.graph.add((cell['URI'], RDF.type, cell_type))

    # It's in the data set defined by the current sheet
    self.graph.add((cell['URI'], TABLINKER.sheet, cell['sheetURI']))

    # Add its value (removed the datatype=XSD.decimal because we can't be sure)
    self.graph.add((cell['URI'], TABLINKER.value, value))

    # Add a cell label
    label = "%s=%s" % (cell['name'], cell['value'])
    self.graph.add((cell['URI'], RDFS.label, Literal(label)))
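# Usage sketch (hedged): the instance name `converter`, the example URIs, the cell
# dict and the TABLINKER.DataCell type below are illustrative assumptions, not part
# of the original code. A call like this adds four triples: the cell's type, the
# sheet it belongs to, its cleaned value, and a "name=value" label.
from rdflib import URIRef

example_cell = {
    'URI': URIRef('http://example.org/sheet1/B2'),
    'sheetURI': URIRef('http://example.org/sheet1'),
    'name': 'B2',
    'value': '42',
}
converter._create_cell(example_cell, TABLINKER.DataCell)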
def sequence_gene_filter(list_from_file):
    gene_acronym = []
    gene_sequence = []
    gene_seq = ''
    for line in list_from_file:
        line_length = len(line)
        refresh = False
        if line_length < 10:
            refresh = True
            gene_acronym.append(line)
        else:
            gene_sequence.append(line)
            gene_seq += line
        if refresh:
            gene_sequence.clear()
            gene_seq = ''
    str_seq = ''
    just_gene_seq = []
    for line in gene_sequence:
        str_seq += clean_string(line)
        # str.upper() returns a new string, so assign the result back
        str_seq = str_seq.upper()
        just_gene_seq.append(str_seq)
    return gene_acronym, just_gene_seq
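# Minimal usage sketch (assumption: FASTA-style input where header lines are shorter
# than 10 characters and the longer lines carry sequence data; the exact strings
# returned depend on clean_string). The second list holds cumulative, cleaned,
# upper-cased sequence strings for the lines after the last header line.
example_lines = [">TP53", "atggaggagccgcagtcagatcctagcgt", "cagccaagtctgtgacttgcacgtactcc"]
acronyms, sequences = sequence_gene_filter(example_lines)
print(acronyms)   # e.g. ['>TP53']
print(sequences)  # cumulative upper-cased sequence strings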
def search_product(data):
    product = clean_string(data['product'])
    max_search_length = int(escape(data['max_returned']))
    s = search.shops(product, max_search_length)
    emit('searching start', {'search_length': len(s)})
    print(f'Searching for {product}')
    start_time = time.time()

    # Clear the cache if this request is more than max_cache_time after the previous one
    global most_recent_cache
    global cache
    if start_time - most_recent_cache >= max_cache_time:
        most_recent_cache = start_time
        cache = dict()

    try:
        emit('search length', f'{len(s)}')
        for i, shop in enumerate(s):
            try:
                # Deal with the cache - if the key exists, use its value without re-searching
                cache_key = f'{shop.shop_name}.{product}.{shop.max_search_length}'
                cache_value = cache.get(cache_key)
                if cache_value is not None:
                    result = cache_value
                else:
                    if shop.json_selector is not None:
                        try:
                            result = search.search_shop_details_json(shop)
                        except Exception as e:
                            print("oh no!", e)
                            error_message = (f"SEARCH FAILED FOR SHOP [{shop.shop_name}] "
                                             f"AND PRODUCT [{product}], REVERTING TO DEFAULT")
                            if not (isinstance(e, HTTPError) and shop.shop_name == "ALDI" and e.code == 503):
                                if bot is not None:
                                    bot.send_message(error_message, e)
                            print(error_message)
                            print(repr(e))
                            page_source = search.load_page_source(shop)
                            result = search.search_page_source(page_source, shop)
                    else:
                        page_source = search.load_page_source(shop)
                        result = search.search_page_source(page_source, shop)
                    # Update the cache with the new value for this shop/product/max combination
                    cache[cache_key] = result
                emit('result', {
                    'shop_name': shop.shop_name,
                    'result': result,
                    'shop_number': i + 1,
                })
                # Without this sleep, the app batches up the emits. I have no idea why.
                # Maybe they're happening too quickly for it to keep up?
                socketio.sleep()
            except Exception as e:
                print(f'{product} - {shop.shop_name}', repr(e))
                if bot is not None:
                    bot.send_message_with_tag(f'{product} - {shop.shop_name}', repr(e))
    except Exception as e:
        print(repr(e))
        if bot is not None:
            bot.send_message_with_tag('', repr(e))
    finally:
        print(f'Search for {product} took {time.time() - start_time}')
        emit('searching stop')
def parse(self, response):
    db_name = response.meta['name']
    for i in response.xpath("//div[contains(@class, 'g')]"):
        print('**** G CLASS ****', i)
        raw_lnk = str(i.xpath(".//cite").extract())
        clink = zone2(raw_lnk)
        print('Testing New Zone2: ', clink)
        if 'https://www.linkedin.com/in/' in clink:
            h3a = i.xpath(".//h3/a").extract()
            name, role1, firm1 = zone1(h3a)
            slp_xtract = i.xpath(
                ".//div[contains(@class, 'slp')]/descendant::text()").extract()
            print('Raw SLP Xtract: ', slp_xtract)
            print('LENGTH of SLP Xtract: ', len(slp_xtract))
            if len(slp_xtract) > 0:
                txt = str(slp_xtract)
                print('length of slp: ', len(txt))
                print('slp class detected. Running Zone3a Analysis...')
                city, role, firm = zone3a(txt)
                print('results from zone3a analysis: ')
                item = TrackItem()
                item['name'] = name
                item['link'] = clink
                item['ident'] = response.meta['lid']
                item['location'] = city
                if role1 is None:
                    item['role'] = role
                else:
                    item['role'] = role1
                if firm1 is None:
                    item['firm'] = firm
                else:
                    item['firm'] = firm1
                score = score_name(item['name'], db_name)
                if score > 80:
                    item['status'] = 'Success'
                    yield item
                else:
                    yield None
            else:
                print('no slp class found. salvaging text')
                st_class = i.xpath(
                    ".//span[contains(@class, 'st')]/descendant::text()").extract()
                print('ST Text Extracted: ', st_class)
                salvage_string = list2string(st_class)
                print('st class converted to string: ', salvage_string)
                cleaned_str = clean_string(salvage_string, name)
                cleaned_str = cleaned_str.strip()
                print('st string filtered: ', cleaned_str)
                item = TrackItem()
                item['name'] = name
                item['link'] = clink
                item['location'] = None
                item['ident'] = response.meta['lid']
                if role1 is None:
                    item['role'] = None
                else:
                    item['role'] = role1
                if firm1 is None:
                    if len(cleaned_str) > 100:
                        print(">>Cleaned string too long for db. Reducing to: ",
                              cleaned_str[:99])
                        item['firm'] = cleaned_str[:99]
                    else:
                        item['firm'] = cleaned_str
                else:
                    item['firm'] = firm1
                score = score_name(item['name'], db_name)
                if score > 80:
                    item['status'] = 'Success'
                    yield item
                else:
                    yield None
def parse(self, response):
    db_name = response.meta['name']
    truelink = response.meta['truelink']
    print('***')
    print('***')
    print('***')
    print('Parsing: ', db_name)
    for i in response.xpath("//div[@class='g']"):
        raw_lnk = str(i.xpath(".//cite").extract())
        clink = zone2(raw_lnk)
        if 'https://www.linkedin.com/in/' in clink and clink == truelink:
            print('Links Matched. Proceeding...')
            print('DB Link: ', truelink)
            print('Scraped Link: ', clink)
            h3a = i.xpath(".//h3/a").extract()
            name, role1, firm1 = zone1(h3a)
            name_test = score_name(name, db_name)
            if name_test > 80:
                print('Passing Score: ', name_test)
                slp_xtract = i.xpath(
                    ".//div[contains(@class, 'slp')]/descendant::text()").extract()
                print('Raw SLP Xtract: ', slp_xtract)
                print('LENGTH of SLP Xtract: ', len(slp_xtract))
                if len(slp_xtract) > 0:
                    txt = str(slp_xtract)
                    print('length of slp: ', len(txt))
                    print('slp class detected. Running Zone3a Analysis...')
                    city, role, firm = zone3a(txt)
                    print('results from zone3a analysis: ')
                    item = S3TrackingItem()
                    item['name'] = name
                    item['link'] = clink
                    item['ident'] = response.meta['lid']
                    item['location'] = city
                    if role1 is None:
                        item['role'] = role
                    else:
                        item['role'] = role1
                    if firm1 is None:
                        item['firm'] = firm
                    else:
                        item['firm'] = firm1
                    yield item
                else:
                    print('no slp class found. salvaging text')
                    st_class = i.xpath(
                        ".//span[contains(@class, 'st')]/descendant::text()").extract()
                    print('ST Text Extracted: ', st_class)
                    salvage_string = list2string(st_class)
                    cleaned_str = clean_string(salvage_string, name)
                    item = S3TrackingItem()
                    item['name'] = name
                    item['link'] = clink
                    item['location'] = None
                    item['ident'] = response.meta['lid']
                    if role1 is None:
                        item['role'] = None
                    else:
                        item['role'] = role1
                    if firm1 is None:
                        salvage_text = cleaned_str.strip()
                        print('length of salvaged text: ', len(salvage_text))
                        if len(salvage_text) < 100:
                            item['firm'] = salvage_text
                        else:
                            try:
                                item['firm'] = salvage_text[:98]
                            except Exception:
                                item['firm'] = None
                    else:
                        item['firm'] = firm1
                    yield item
            else:
                print('Failing Score: ', name_test)
                yield None
        else:
            print("Links Don't Match: ")
            print("DB Link: ", truelink)
            print('Scraped Link: ', clink)
            yield None
# Save top tags to file
with open(config.paths.TAGS, 'w') as f:
    for tag in tags:
        f.write('%s\n' % (tag))

# Create word counter
word_counter = collections.Counter()

# Save posts to file config.paths.POST
text_count = 0
word_count = 0
with open(config.paths.POST, 'w') as f:
    for post in helpers.get_posts_filtered(tags, max_posts=MAX_POSTS):
        title = helpers.clean_string(post['title'])
        body = helpers.clean_string(post['body'])
        # Use a separate name so the outer `tags` list is not shadowed inside the loop
        tag_str = ' '.join(post['tags'])
        for text in [title, body]:
            for word in text.split():
                word_counter[word] += 1
                word_count += 1
        line = config.text.delimitter.join([title, body, tag_str])
        f.write('%s\n' % (line))
        text_count += 1

# Save meta data
config.text.save_meta_data(text_count=text_count)
def parse_vals(vals):
    r = []
    for x in vals:
        r += [int(helpers.clean_string(x)) / 100000.0]
    return r
def test_clean_string_no_digits(self):
    result = helpers.clean_string('asdfASDF')
    self.assertEqual(result, 'asdfASDF')
def parse_vals(vals):
    r = []
    for x in vals:
        r += [round(int(helpers.clean_string(x)) / 1024.0 * 5, 3)]
    return r
def overwrite(self, source):
    q = ''
    # check for a valid url
    if helpers.is_bad(source[1]) or source[1].find('.') == -1:
        return q

    # add url to graph
    url = '<http://' + urllib.parse.quote(
        source[1].replace("http://", "").replace("https://", "")) + '>'
    url_item = '<http://' + urllib.parse.quote(source[1]) + '/item>'
    graph = """ GRAPH """ + url

    # add url
    match = "{" + graph + "{ ?item wdt:P1896 ?url}}"
    q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
          " wdt:P1896 " + url + """ }} WHERE {} ;""")

    # add country
    if not helpers.is_bad(source[0]):
        country_code = self.get_country_code(source[0])
        if not helpers.is_bad(country_code):
            c = country_code
        else:
            c = helpers.clean(source[0])
        match = "{" + graph + "{ ?item wdt:P17 ?country}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wdt:P17 " + c + """ }} WHERE {} ;""")

    # add title
    if not helpers.is_bad(source[2]):
        match = "{" + graph + "{ ?item wdt:P1448 ?title}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wdt:P1448 \'" + helpers.clean_string(source[2]) + """\' }} WHERE {} ;""")

    # add language
    if not helpers.is_bad(source[3]):
        match = "{" + graph + "{ ?item wdt:P37 ?lang}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wdt:P37 \'" + helpers.clean_string(source[3]) + """\' }} WHERE {} ;""")

    # add type
    if not helpers.is_bad(source[4]):
        match = "{" + graph + "{ ?item wdt:P31 ?type}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wdt:P31 \'" + helpers.clean_string(source[4]) + """\' }} WHERE {} ;""")

    # add title (native language)
    if not helpers.is_bad(source[5]):
        match = "{" + graph + "{ ?item wdt:P1704 ?title}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wdt:P1704 \'" + helpers.clean_string(source[5]) + """\'}} WHERE {} ;""")

    # add paywall (uses source[6], the paywall field, rather than repeating the title)
    if not helpers.is_bad(source[6]):
        match = "{" + graph + "{ ?item wnp:paywalled ?pw}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wnp:paywalled \'" + helpers.clean_string(source[6]) + """\' }} WHERE {} ;""")

    # add metasources
    for ms in source[7]:
        if not helpers.is_bad(ms):
            match = "{" + graph + "{ ?item wnp:metasource ?ms}}"
            q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
                  " wnp:metasource wni:" + helpers.strip_spaces(ms).lower() +
                  """ }} WHERE {} ;""")

    # add state
    if not helpers.is_bad(source[8]):
        match = "{" + graph + "{ ?item wdt:P131 ?state}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wdt:P131 \'" + helpers.clean_string(source[8]) + """\' }} WHERE {} ;""")

    # add wikipedia name
    if not helpers.is_bad(source[10]):
        match = "{" + graph + "{ ?item wnp:wikipedia-name ?wp_name}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wnp:wikipedia-name \'" + helpers.clean_string(source[10]) + """\' }} WHERE {} ;""")

    # add redirects
    if not helpers.is_bad(source[11]):
        match = "{" + graph + "{ ?item wnp:redirect ?rd}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wnp:redirect \'" + helpers.clean_string(source[11]) + """\' }} WHERE {} ;""")

    # add wikipedia link
    if not helpers.is_bad(source[12]):
        match = "{" + graph + "{ ?item wnp:wikipedia-page ?wp_page}}"
        q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
              " wnp:wikipedia-page \'" + helpers.clean_string(source[12]) + """\' }} WHERE {} ;""")

    # add description
    try:
        if not helpers.is_bad(source[14]):
            match = "{" + graph + "{ ?item wnp:description ?desc}}"
            q += ("DELETE" + match + """ INSERT { """ + graph + " {" + url_item +
                  " wnp:description \'" + helpers.clean_string(source[14]) + """\' }} WHERE {} ;""")
    except IndexError:
        pass

    return q
def get_path_spec(self, paths):
    q = ''
    for path in paths:
        q += """; wnp:haspath \'""" + helpers.clean_string(path) + "\'"
    return q
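# Illustrative only: `loader` stands in for an instance of the class above, the path
# strings are made up, and the exact result depends on helpers.clean_string. Each
# path becomes one "; wnp:haspath '...'" clause, which first_load below appends to
# the INSERT block it is building.
spec = loader.get_path_spec(['news/politics', 'news/sport'])
# spec is roughly: "; wnp:haspath 'news/politics'; wnp:haspath 'news/sport'"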
def first_load(self, source):
    # checks for bad URLs
    if helpers.is_bad(source[1]) or source[1].find('.') == -1:
        return ''

    # insert URL
    url = '<http://' + urllib.parse.quote(source[1]) + '>'
    url_item = '<http://' + urllib.parse.quote(source[1]) + '/item>'
    q = """GRAPH """ + url + """ { """ + url_item + """ wdt:P1896 """ + url

    # add country
    if not helpers.is_bad(source[0]):
        country_code = self.get_country_code(source[0])
        if not helpers.is_bad(country_code):
            q += """; wdt:P17 """ + country_code
        else:
            q += """; wdt:P17 \'""" + helpers.clean_string(source[0]) + """\' """

    # add title
    if not helpers.is_bad(source[2]):
        q += """; wdt:P1448 \'""" + helpers.clean_string(source[2]) + """\' """

    # add language
    if not helpers.is_bad(source[3]):
        q += """; wdt:P37 \'""" + helpers.clean_string(source[3]) + """\' """

    # add type
    if not helpers.is_bad(source[4]):
        q += """; wdt:P31 \'""" + helpers.clean_string(source[4]) + """\' """

    # add title (native language)
    if not helpers.is_bad(source[5]):
        q += """; wdt:P1704 \'""" + helpers.clean_string(source[5]) + """\' """

    # add paywall
    if not helpers.is_bad(source[6]):
        q += """; wnp:paywalled \'""" + helpers.clean_string(source[6]) + """\' """

    # add metasources
    if not helpers.is_bad(source[7]):
        q += self.get_ms(source[7])

    # add state
    if not helpers.is_bad(source[8]):
        q += """; wdt:P131 \'""" + helpers.clean_string(source[8]) + """\' """

    # add town
    if not helpers.is_bad(source[9]):
        q += """; wdt:P131 \'""" + helpers.clean_string(source[9]) + """\' """

    # add wikipedia name
    if not helpers.is_bad(source[10]):
        q += """; wnp:wikipedia-name \'""" + helpers.clean_string(source[10]) + "\' "

    # add redirects
    if not helpers.is_bad(source[11]):
        q += """; wnp:redirect \'""" + helpers.clean_string(source[11]) + """\' """

    # add wikipedia link
    if not helpers.is_bad(source[12]):
        q += """; wnp:wikipedia-page \'""" + urllib.parse.quote(source[12]) + """\'"""

    # add paths
    if not helpers.is_bad(source[13]):
        q += self.get_path_spec(source[13])

    # add description
    if not helpers.is_bad(source[14]):
        q += """; wnp:description \'""" + helpers.clean_string(source[14]) + "\'"

    q += """.}"""
    return q