Example #1
    def handleAnnotation(self, cell, annotation):
        """
        Create relevant triples for the annotation attached to the cell
        """
        
        # Create triples according to the Open Annotation model.
        # Wrap the concatenated identifiers in URIRef so rdflib accepts them as nodes.
        annotation_URI = URIRef(cell['URI'] + "-oa")
        annotation_body_URI = URIRef(annotation_URI + '-body')

        self.graph.add((annotation_URI, RDF.type, OA.Annotation))
        self.graph.add((annotation_URI, OA.hasTarget, cell['URI']))
        self.graph.add((annotation_URI, OA.hasBody, annotation_body_URI))
        
        self.graph.add((annotation_body_URI, RDF.type, RDFS.Resource))
        self.graph.add((annotation_body_URI,
                        TABLINKER.value,
                        Literal(clean_string(getText(annotation)))))
        
        # Extract author
        author = annotation.getElementsByType(dc.Creator)
        if len(author) > 0:
            author = clean_string(str(author[0]))
            self.graph.add((annotation_body_URI, OA.annotatedBy, Literal(author)))
            
        # Extract date
        creation_date = annotation.getElementsByType(dc.Date)
        if len(creation_date) > 0:
            creation_date = str(creation_date[0])
            self.graph.add((annotation_body_URI, OA.serializedAt, Literal(creation_date, datatype=XSD.date)))
Example #2
 def handleTitle(self, cell):
     """
     Create relevant triples for the cell marked as Title 
     """
     self.graph.add((cell['sheetURI'],
                     RDFS.comment,
                     Literal(clean_string(cell['value']))))        
Example #3
 def _create_cell(self, cell, cell_type):
     """
     Create a new cell
     """
     
     # Set the value
     value = Literal(clean_string(cell['value']))
         
     # It's a cell
     self.graph.add((cell['URI'], RDF.type, cell_type))
     
     # It's in the data set defined by the current sheet
     self.graph.add((cell['URI'], TABLINKER.sheet, cell['sheetURI']))
     
     # Add its value (removed the datatype=XSD.decimal because we can't be sure)
     self.graph.add((cell['URI'], TABLINKER.value, value))
     
     # Add a cell label
     label = "%s=%s" % (cell['name'], cell['value'])
     self.graph.add((cell['URI'], RDFS.label, Literal(label)))
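Examples #1-#3 come from the same TabLinker-style spreadsheet converter and rely on module-level rdflib setup that the snippets do not show: a graph plus the OA and TABLINKER namespaces (and, in Example #1, the odfpy dc elements and a getText helper for reading the annotation text). A minimal sketch of that assumed setup follows; only the OA IRI is the standard Web/Open Annotation namespace, the TABLINKER IRI is a placeholder.

# Assumed module-level setup for Examples #1-#3 (the TABLINKER IRI is a placeholder).
from rdflib import Graph, Literal, Namespace, URIRef
from rdflib.namespace import RDF, RDFS, XSD

OA = Namespace("http://www.w3.org/ns/oa#")              # standard Web/Open Annotation vocabulary
TABLINKER = Namespace("http://example.org/tablinker#")  # placeholder for the real TabLinker vocabulary

graph = Graph()                                          # in the converter class this is stored as self.graph
graph.bind("oa", OA)
graph.bind("tablinker", TABLINKER)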
Example #4
def sequence_gene_filter(list_from_file):
    gene_acronym = []
    gene_sequence = []
    gene_seq = ''
    for line in list_from_file:
        line_length = len(line)
        refresh = False
        if line_length < 10:
            refresh = True
            gene_acronym.append(line)
        else:
            gene_sequence.append(line)
            gene_seq += line
        if refresh:
            gene_sequence.clear()
            gene_seq = ''

    str_seq = ''
    just_gene_seq = []
    for line in gene_sequence:
        str_seq += clean_string(line)
    str_seq = str_seq.upper()
    just_gene_seq.append(str_seq)
    return gene_acronym, just_gene_seq
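sequence_gene_filter expects the raw lines of a FASTA-like listing in which short lines (fewer than 10 characters) are gene acronyms and longer lines are sequence chunks. A hypothetical call, with an assumed input file name:

# Hypothetical usage of sequence_gene_filter; 'genes.txt' is an assumed input file.
with open('genes.txt') as handle:
    lines = [line.strip() for line in handle]

acronyms, sequences = sequence_gene_filter(lines)
print(acronyms)   # the short acronym lines, in order of appearance
print(sequences)  # one cleaned, upper-cased sequence string (built after the last acronym)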
Example #5
def search_product(data):
    product = clean_string(data['product'])
    max_search_length = int(escape(data['max_returned']))
    s = search.shops(product, max_search_length)
    emit('searching start', {'search_length': len(s)})
    print(f'Searching for {product}')
    start_time = time.time()

    # clear cache if request is more than max_cache_time since previous request
    global most_recent_cache
    global cache
    if start_time - most_recent_cache >= max_cache_time:
        most_recent_cache = start_time
        cache = dict()

    try:
        emit('search length', f'{len(s)}')
        for i, shop in enumerate(s):
            try:
                # deal with cache - if key exists, assign value without bothering to re-search
                cache_key = f'{shop.shop_name}.{product}.{shop.max_search_length}'
                cache_value = cache.get(cache_key)
                if cache_value is not None:
                    result = cache_value
                else:
                    if shop.json_selector is not None:
                        try:
                            result = search.search_shop_details_json(shop)
                        except Exception as e:
                            print("oh no!", e)
                            error_message = f"SEARCH FAILED FOR SHOP [{shop.shop_name}] AND PRODUCT [{product}], REVERTING TO DEFAULT"
                            if not (isinstance(e, HTTPError) and shop.shop_name
                                    == "ALDI" and e.code == 503):
                                if bot is not None:
                                    bot.send_message(error_message, e)
                                print(error_message)
                                print(repr(e))
                            page_source = search.load_page_source(shop)
                            result = search.search_page_source(
                                page_source, shop)
                    else:
                        page_source = search.load_page_source(shop)
                        result = search.search_page_source(page_source, shop)
                    # update cache with new value for shop/product/max combination
                    cache[cache_key] = result
                emit(
                    'result', {
                        'shop_name': shop.shop_name,
                        'result': result,
                        'shop_number': i + 1
                    })
                # Without this sleep the app batches up the emits, possibly
                # because they happen too quickly for it to keep up.
                socketio.sleep()
            except Exception as e:
                print(f'{product} - {shop.shop_name}', repr(e))
                if bot is not None:
                    bot.send_message_with_tag(f'{product} - {shop.shop_name}',
                                              repr(e))
    except Exception as e:
        print(repr(e))
        if bot is not None:
            bot.send_message_with_tag('', repr(e))
    finally:
        print(f'Search for {product} took {time.time() - start_time}')
        emit('searching stop')
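search_product leans on several module-level names that the snippet does not define: the Flask-SocketIO objects, the cache globals, and an optional notification bot. A rough sketch of that assumed surrounding module is below; only the names used inside the handler are taken from the code, the imports and values are guesses.

# Assumed module-level context for search_product; values are illustrative.
import time
from urllib.error import HTTPError      # assumed source of HTTPError

from flask import Flask
from flask_socketio import SocketIO, emit
from markupsafe import escape           # assumed source of escape

app = Flask(__name__)
socketio = SocketIO(app)

max_cache_time = 60 * 60   # seconds before the result cache is discarded
most_recent_cache = 0.0    # timestamp of the last cache reset
cache = dict()             # cache_key -> previously fetched result
bot = None                 # optional notification bot; None disables alerts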
Example #6
    def parse(self, response):
        db_name = response.meta['name']
        for i in response.xpath("//div[contains(@class, 'g')]"):
            print('**** G CLASS ****', i)
            raw_lnk = str(i.xpath(".//cite").extract())
            clink = zone2(raw_lnk)
            print('Testing New Zone2: ', clink)
            if 'https://www.linkedin.com/in/' in clink:
                h3a = i.xpath(".//h3/a").extract()
                name, role1, firm1 = zone1(h3a)
                slp_xtract = i.xpath(
                    ".//div[contains(@class, 'slp')]/descendant::text()"
                ).extract()
                print('Raw SLP Xtract: ', slp_xtract)
                print('LENGTH of SLP Xtract: ', len(slp_xtract))

                if len(slp_xtract) > 0:
                    txt = str(slp_xtract)
                    print('length of slp: ', len(txt))
                    print('slp class detected. Running Zone3a Analysis...')
                    city, role, firm = zone3a(txt)
                    print('results from zone3a analysis: ')
                    item = TrackItem()
                    item['name'] = name
                    item['link'] = clink
                    item['ident'] = response.meta['lid']
                    item['location'] = city
                    if role1 is None:
                        item['role'] = role
                    else:
                        item['role'] = role1
                    if firm1 is None:
                        item['firm'] = firm
                    else:
                        item['firm'] = firm1
                    score = score_name(item['name'], db_name)
                    if score > 80:
                        item['status'] = 'Success'
                        yield item
                    else:
                        yield None

                else:
                    print('no slp class found.  salvaging text')
                    st_class = i.xpath(
                        ".//span[contains(@class, 'st')]/descendant::text()"
                    ).extract()
                    print('ST Text Extracted: ', st_class)
                    salvage_string = list2string(st_class)
                    print('st class converted to string: ', salvage_string)
                    cleaned_str = clean_string(salvage_string, name)
                    cleaned_str = cleaned_str.strip()
                    print('st string filtered: ', cleaned_str)
                    item = TrackItem()
                    item['name'] = name
                    item['link'] = clink
                    item['location'] = None
                    item['ident'] = response.meta['lid']
                    if role1 is None:
                        item['role'] = None
                    else:
                        item['role'] = role1
                    if firm1 is None:
                        if len(cleaned_str) > 100:
                            print(
                                ">>Cleaned string too long for db. Reducing to: ",
                                cleaned_str[:99])
                            item['firm'] = cleaned_str[:99]
                        else:
                            item['firm'] = cleaned_str
                    else:
                        item['firm'] = firm1
                    score = score_name(item['name'], db_name)
                    if score > 80:
                        item['status'] = 'Success'
                        yield item
                    else:
                        yield None
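The parse callback reads name and lid out of response.meta, so the spider has to attach both when it schedules each search-results request. A minimal sketch with a purely illustrative URL and values:

# Hypothetical request construction for the parse() callback above;
# the spider name, URL, 'name' and 'lid' values are placeholders.
import scrapy

class TrackSpider(scrapy.Spider):
    name = 'track'

    def start_requests(self):
        url = 'https://www.google.com/search?q=%22Jane+Doe%22+site%3Alinkedin.com%2Fin'
        yield scrapy.Request(
            url,
            callback=self.parse,
            meta={'name': 'Jane Doe',  # compared against the scraped name by score_name()
                  'lid': 42},          # database identifier copied onto the item
        )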
Example #7
    def parse(self, response):
        db_name = response.meta['name']
        truelink = response.meta['truelink']
        print('***')
        print('***')
        print('***')
        print('Parsing: ', db_name)
        for i in response.xpath("//div[@class='g']"):
            raw_lnk = str(i.xpath(".//cite").extract())
            clink = zone2(raw_lnk)
            if 'https://www.linkedin.com/in/' in clink and clink == truelink:
                print('Links Matched. Proceeding...')
                print('DB Link: ', truelink)
                print('Scraped Link: ', clink)
                h3a = i.xpath(".//h3/a").extract()
                name, role1, firm1 = zone1(h3a)

                name_test = score_name(name, db_name)
                if name_test > 80:
                    print('Passing Score: ', name_test)
                    slp_xtract = i.xpath(
                        ".//div[contains(@class, 'slp')]/descendant::text()"
                    ).extract()
                    print('Raw SLP Xtract: ', slp_xtract)
                    print('LENGTH of SLP Xtract: ', len(slp_xtract))

                    if len(slp_xtract) > 0:
                        txt = str(slp_xtract)
                        print('length of slp: ', len(txt))
                        print('slp class detected. Running Zone3a Analysis...')
                        city, role, firm = zone3a(txt)
                        print('results from zone3a analysis: ')
                        item = S3TrackingItem()
                        item['name'] = name
                        item['link'] = clink
                        item['ident'] = response.meta['lid']
                        item['location'] = city
                        if role1 is None:
                            item['role'] = role
                        else:
                            item['role'] = role1
                        if firm1 is None:
                            item['firm'] = firm
                        else:
                            item['firm'] = firm1

                        yield item

                    else:
                        print('no slp class found.  salvaging text')
                        st_class = i.xpath(
                            ".//span[contains(@class, 'st')]/descendant::text()"
                        ).extract()
                        print('ST Text Extracted: ', st_class)
                        salvage_string = list2string(st_class)
                        cleaned_str = clean_string(salvage_string, name)
                        item = S3TrackingItem()
                        item['name'] = name
                        item['link'] = clink
                        item['location'] = None
                        item['ident'] = response.meta['lid']
                        if role1 is None:
                            item['role'] = None
                        else:
                            item['role'] = role1
                        if firm1 is None:
                            salvage_text = cleaned_str.strip()
                            print('length of salvaged text: ',
                                  len(salvage_text))
                            if len(salvage_text) < 100:
                                item['firm'] = salvage_text
                            else:
                                try:
                                    item['firm'] = salvage_text[:98]
                                except Exception:
                                    item['firm'] = None
                        else:
                            item['firm'] = firm1
                        yield item

                else:
                    print('Failing Score: ', name_test)
                    yield None
            else:
                print("Links Don't Match: ")
                print("DB Link: ", truelink)
                print('Scraped Link: ', clink)
                yield None
Example #8
    # Save top tags to file
    with open(config.paths.TAGS, 'w') as f:
        for tag in tags:
            f.write('%s\n' % (tag))

    # Create word counter
    word_counter = collections.Counter()

    # Save posts to file
    text_count = 0
    word_count = 0
    with open(config.paths.POST, 'w') as f:
        for post in helpers.get_posts_filtered(tags, max_posts=MAX_POSTS):
            title = helpers.clean_string(post['title'])
            body = helpers.clean_string(post['body'])
            post_tags = ' '.join(post['tags'])

            for text in [title, body]:
                for word in text.split():
                    word_counter[word] += 1
                    word_count += 1

            line = config.text.delimitter.join([title, body, post_tags])

            f.write('%s\n' % (line))
            text_count += 1

    # Save meta data
    config.text.save_meta_data(text_count=text_count)
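This snippet depends on a project-local config module (plus helpers.get_posts_filtered and a MAX_POSTS constant) that is not shown. The attribute names below are taken from the code; the concrete paths, delimiter, and save_meta_data body are assumptions.

# config.py - assumed shape of the configuration used above (values are placeholders).
class paths:
    TAGS = 'data/tags.txt'
    POST = 'data/posts.txt'

class text:
    delimitter = '\t'   # spelling kept as used in the snippet

    @staticmethod
    def save_meta_data(text_count):
        with open('data/meta.txt', 'w') as f:
            f.write('text_count=%d\n' % text_count)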
Example #9
 def parse_vals(vals):
     r = []
     for x in vals:
         r += [int(helpers.clean_string(x)) / 100000.0]
     return r
Example #10
 def test_clean_string_no_digits(self):
     result = helpers.clean_string('asdfASDF')
     self.assertEqual(result, 'asdfASDF')
Example #11
 def parse_vals(vals):
     r = []
     for x in vals:
         r += [round(int(helpers.clean_string(x)) / 1024.0 * 5, 3)]
     return r
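Examples #9-#11 share the same helpers.clean_string: the unit test shows that text without digits passes through unchanged, while both parse_vals variants expect the cleaned string to still be parseable by int(). One hypothetical implementation consistent with that behaviour (purely illustrative; the real helper may do more):

# Hypothetical clean_string matching the behaviour shown above.
def clean_string(value):
    # Strip surrounding whitespace and drop thousands separators, so
    # 'asdfASDF' -> 'asdfASDF' and ' 1,234 ' -> '1234'.
    return value.strip().replace(',', '')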
Example #12
    def overwrite(self, source):
        q = ''

        # check for a valid url
        if helpers.is_bad(source[1]) or source[1].find('.') == -1:
            return q

        # add url to graph
        url = '<http://' + urllib.parse.quote(source[1].replace(
            "http://", "").replace("https://", "")) + '>'
        url_item = '<http://' + urllib.parse.quote(source[1]) + '/item>'
        graph = """ GRAPH """ + url

        # add url
        match = "{" + graph + "{ ?item wdt:P1896 ?url}}"
        q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wdt:P1896 "
              "" + url + """ }} 
              WHERE {} ;""")

        # add country
        if not helpers.is_bad(source[0]):
            country_code = self.get_country_code(source[0])
            if not helpers.is_bad(country_code):
                c = country_code
            else:
                c = helpers.clean(source[0])
            match = "{" + graph + "{ ?item wdt:P17 ?country}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wdt:P17 " + c +
                  """ }} 
              WHERE {} ;""")

        # add title
        if not helpers.is_bad(source[2]):
            match = "{" + graph + "{ ?item wdt:P1448 ?title}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wdt:P1448 \'" +
                  helpers.clean_string(source[2]) + """\' }} 
             WHERE {} ;""")

        # add language
        if not helpers.is_bad(source[3]):
            match = "{" + graph + "{ ?item wdt:P37 ?lang}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wdt:P37 \'" +
                  helpers.clean_string(source[3]) + """\' }} 
              WHERE {} ;""")

        # add type
        if not helpers.is_bad(source[4]):
            match = "{" + graph + "{ ?item wdt:P31 ?type}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wdt:P31 \'" +
                  helpers.clean_string(source[4]) + """\' }} 
              WHERE {} ;""")

        # add title (native language)
        if not helpers.is_bad(source[5]):
            match = "{" + graph + "{ ?item wdt:P1704 ?title}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wdt:P1704 \'" +
                  helpers.clean_string(source[5]) + """\'}} 
              WHERE {} ;""")

        # add paywall
        if not helpers.is_bad(source[6]):
            match = "{" + graph + "{ ?item wnp:paywalled ?pw}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wnp:paywalled \'" +
                  helpers.clean_string(source[6]) + """\' }}
              WHERE {} ;""")

        # add metasources
        for ms in source[7]:
            if not helpers.is_bad(ms):
                match = "{" + graph + "{ ?item wnp:metasource ?ms}}"
                q += ("DELETE" + match + """
                  INSERT { """ + graph + " {" + url_item +
                      " wnp:metasource wni:" +
                      helpers.strip_spaces(ms).lower() + """ }} 
                  WHERE {} ;""")

        # add state
        if not helpers.is_bad(source[8]):
            match = "{" + graph + "{ ?item wdt:P131 ?state}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wdt:P131 \'" +
                  helpers.clean_string(source[8]) + """\' }} 
              WHERE {} ;""")

        # add wikipedia name
        if not helpers.is_bad(source[10]):
            match = "{" + graph + "{ ?item wnp:wikipedia-name ?wp_name}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item +
                  " wnp:wikipedia-name \'" + helpers.clean_string(source[10]) +
                  """\' }} 
              WHERE {} ;""")

        # add redirects?
        if not helpers.is_bad(source[11]):
            match = "{" + graph + "{ ?item wnp:redirect ?rd}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item + " wnp:redirect \'" +
                  helpers.clean_string(source[11]) + """\' }} 
              WHERE {} ;""")

        # add wikipedia link
        if not helpers.is_bad(source[12]):
            match = "{" + graph + "{ ?item wnp:wikipedia-page ?wp_page}}"
            q += ("DELETE" + match + """
              INSERT { """ + graph + " {" + url_item +
                  " wnp:wikipedia-page \'" + helpers.clean_string(source[12]) +
                  """\' }} 
              WHERE {} ;""")

        # add description
        try:
            if not helpers.is_bad(source[14]):
                match = "{" + graph + "{ ?item wnp:description ?desc}}"
                q += ("DELETE" + match + """
                  INSERT { """ + graph + " {" + url_item +
                      " wnp:description \'" +
                      helpers.clean_string(source[14]) + """\' }} 
                  WHERE {} ;""")
        except IndexError:
            pass

        return q
Example #13
 def get_path_spec(self, paths):
     q = ''
     for path in paths:
         q += """;
             wnp:haspath \'""" + helpers.clean_string(path) + "\'"
     return q
Example #14
    def first_load(self, source):

        # checks for bad URLs
        if helpers.is_bad(source[1]) or source[1].find('.') == -1:
            return ''

        # insert URL
        url = '<http://' + urllib.parse.quote(source[1]) + '>'
        url_item = '<http://' + urllib.parse.quote(source[1]) + '/item>'
        q = """GRAPH """ + url + """ { 
        """ + url_item + """ wdt:P1896 """ + url

        # add country
        if not helpers.is_bad(source[0]):
            country_code = self.get_country_code(source[0])
            if not helpers.is_bad(country_code):
                q += """;
                wdt:P17 """ + country_code
            else:
                q += """;
                wdt:P17 \'""" + helpers.clean_string(source[0]) + """\' """

        # add title
        if not helpers.is_bad(source[2]):
            q += """;
                wdt:P1448 \'""" + helpers.clean_string(source[2]) + """\' """

        # add language
        if not helpers.is_bad(source[3]):
            q += """;
                wdt:P37 \'""" + helpers.clean_string(source[3]) + """\' """

        # add type
        if not helpers.is_bad(source[4]):
            q += """;
                wdt:P31 \'""" + helpers.clean_string(source[4]) + """\' """

        # add title (native language)
        if not helpers.is_bad(source[5]):
            q += """;
                wdt:P1704 \'""" + helpers.clean_string(source[5]) + """\' """

        # add paywall
        if not helpers.is_bad(source[6]):
            q += """;
                wnp:paywalled \'""" + helpers.clean_string(
                source[6]) + """\' """

        # add metasources
        if not helpers.is_bad(source[7]):
            q += self.get_ms(source[7])

        # add state
        if not helpers.is_bad(source[8]):
            q += """;
                wdt:P131 \'""" + helpers.clean_string(source[8]) + """\' """

        # add town
        if not helpers.is_bad(source[9]):
            q += """;
                wdt:P131 \'""" + helpers.clean_string(source[9]) + """\' """

        # add wikipedia name
        if not helpers.is_bad(source[10]):
            q += """;
                wnp:wikipedia-name \'""" + helpers.clean_string(
                source[10]) + "\' "

        # add redirects?
        if not helpers.is_bad(source[11]):
            q += """;
                wnp:redirect \'""" + helpers.clean_string(
                source[11]) + """\' """

        # add wikipedia link
        if not helpers.is_bad(source[12]):
            q += """;
                wnp:wikipedia-page \'""" + urllib.parse.quote(
                source[12]) + """\'"""

        # add paths
        if not helpers.is_bad(source[13]):
            q += self.get_path_spec(source[13])

        # add description
        if not helpers.is_bad(source[14]):
            q += """;
                wnp:description \'""" + helpers.clean_string(source[14]) + "\'"

        q += """.}"""

        return q
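Both first_load and overwrite only build SPARQL Update text; nothing in the snippets shows how it is executed. Below is one way to run the result of first_load against a SPARQL 1.1 Update endpoint with SPARQLWrapper, assuming the wdt:/wnp:/wni: prefixes are declared up front; the endpoint URL and the wnp:/wni: IRIs are placeholders. The output of overwrite is already a complete sequence of DELETE/INSERT statements and would only need the same prefix header.

# Hypothetical execution of the generated update; endpoint and wnp:/wni: IRIs are placeholders.
from SPARQLWrapper import SPARQLWrapper, POST

PREFIXES = """
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX wnp: <http://example.org/wnp#>
PREFIX wni: <http://example.org/wni#>
"""

def run_first_load(loader, source, endpoint_url):
    graph_block = loader.first_load(source)
    if not graph_block:
        return                      # bad URL, nothing to insert
    sparql = SPARQLWrapper(endpoint_url)
    sparql.setMethod(POST)
    sparql.setQuery(PREFIXES + "INSERT DATA { " + graph_block + " }")
    sparql.query()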