def _get_doc_by_title(self, aliases_dict):
        doc = None
        try:
            biblio = aliases_dict["biblio"][0]
            biblio_title = remove_punctuation(biblio["title"]).lower()
            biblio_year = str(biblio["year"])
            if biblio_title and biblio_year:
                try:
                    doc = self.session.catalog.advanced_search(
                            title=biblio_title, 
                            min_year=biblio_year, 
                            max_year=biblio_year,
                            view='stats').list(page_size=1).items[0]
                except (UnicodeEncodeError, IndexError):
                    biblio_title = remove_punctuation(
                        biblio["title"].encode('ascii', 'ignore').decode('ascii')).lower()
                    try:
                        doc = self.session.catalog.advanced_search(
                                title=biblio_title, 
                                min_year=biblio_year, 
                                max_year=biblio_year,
                                view='stats').list(page_size=1).items[0]
                    except IndexError:
                        return None

                mendeley_title = remove_punctuation(doc.title).lower()
                if biblio_title != mendeley_title:
                    logger.debug(u"Mendeley: titles don't match so not using this match /biblio_print %s and %s" %(
                        biblio_title, mendeley_title))
                    doc = None
        except (KeyError, MendeleyException):
            # logger.info(u"No biblio found in _get_doc_by_title")
            pass
        return doc
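Every example on this page assumes a remove_punctuation helper (sometimes as util.remove_punctuation) from its own project. A minimal sketch of what such a helper might look like, as an assumption rather than any project's actual implementation:

import string

def remove_punctuation(text):
    # Strip every ASCII punctuation character; everything else passes through.
    return text.translate(str.maketrans('', '', string.punctuation))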
Example #2
    def _find_product_page(self, use_organic=True):
        '''Find the Product Page from the Company's website.

        Sometimes when there is only one result, a Site will return the
        result's details page instead of a search results page. A Site can set
        the :data:`SEARCH_REDIRECTED_TEXT` class attribute to handle this.

        :param use_organic: Whether to search for the organic version first,
                            falling back to a non-organic search on no match.
        :type use_organic: bool
        :returns: The Product Page's HTML or :obj:`None`
        :rtype: :obj:`str`

        '''
        search_terms = remove_punctuation(self.sese_name)
        if use_organic and self.sese_organic:
            search_terms += " organic"
        if self.INCLUDE_CATEGORY_IN_SEARCH:
            search_terms += ' ' + remove_punctuation(self.sese_category)
        search_page = self._search_site(search_terms)

        if self.SEARCH_REDIRECTED_TEXT is not None:
            if self.SEARCH_REDIRECTED_TEXT in search_page:
                return search_page

        match = self._get_best_match_or_none(search_page)
        check_without_organic = (self.sese_organic and match is None
                                 and use_organic)
        return (match if not check_without_organic else
                self._find_product_page(use_organic=False))
Example #3
    def _prepend_name_match_amounts(self, search_results):
        '''Prepend the % of SESE Name matched to the ``search_results`` list.

        ``search_results`` should be a list of (URL, Name) tuples.

        This method iterates through the provided ``search_results`` comparing
        the Product Name with the SESE Product Name by calculating the
        percentage of words in the Company's Name that are also in the SESE
        Name.

        The match percentage will be prepended to each :obj:`tuple` in the
        ``search_results``, returning a list of ``[(Match Percentage, (URL,
        Name)), ...]`` sorted by match percentage, descending.

        :param search_results: A list of tuples containing the ``(URL, Name)``
                               of each matching Product
        :type search_results: list
        :returns: A list of tuples containing ``(Match%, (URL, Name))`` of each
                  Product
        :rtype: :obj:`list`

        '''
        sese_words = [
            remove_punctuation(x) for x in self.sese_name.lower().split() +
            self.sese_category.lower().split()
        ]
        number_of_sese_words = len(sese_words)
        output = []
        for result in search_results:
            number_of_matches = 0
            site_words = [
                remove_punctuation(x) for x in result[1].lower().split()
            ]
            number_of_site_words = len(site_words)

            for word in site_words:
                if word in sese_words:
                    number_of_matches += 1

            percent_site_words_matched = (float(number_of_matches) /
                                          number_of_site_words * 100)
            site_to_sese_word_ratio = (float(number_of_site_words) /
                                       number_of_sese_words)
            percent_sese_words_matched = min(
                float(number_of_matches) / number_of_sese_words * 100, 100)
            sese_to_site_word_ratio = (float(number_of_sese_words) /
                                       number_of_site_words)

            match_percentage = (
                percent_site_words_matched * site_to_sese_word_ratio +
                percent_sese_words_matched * sese_to_site_word_ratio) / 2
            output.append((match_percentage, result))
        output.sort(key=lambda x: x[0], reverse=True)
        return output
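The score above is the average of two ratio-weighted percentages, which keeps short result names from being over-rewarded for partial hits. A standalone sketch of the same arithmetic on hypothetical data (the word lists are illustrative only):

# Hypothetical data: SESE name + category words vs. one result's words.
sese_words = ["cherry", "tomato", "seeds"]  # 3 words
site_words = ["cherry", "tomato"]           # 2 words, both match

number_of_matches = sum(1 for w in site_words if w in sese_words)  # 2

percent_site_words_matched = number_of_matches / len(site_words) * 100            # 100.0
site_to_sese_word_ratio = len(site_words) / len(sese_words)                       # 0.667
percent_sese_words_matched = min(number_of_matches / len(sese_words) * 100, 100)  # 66.7
sese_to_site_word_ratio = len(sese_words) / len(site_words)                       # 1.5

match_percentage = (percent_site_words_matched * site_to_sese_word_ratio +
                    percent_sese_words_matched * sese_to_site_word_ratio) / 2
print(match_percentage)  # 83.3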
Example #4
def is_open_via_doaj_issn(issns, pub_year=None):
    if issns:
        for issn in issns:
            issn = remove_punctuation(issn)
            for (row_issn, row_license, doaj_start_year) in doaj_issns:
                if issn == remove_punctuation(row_issn):
                    if doaj_start_year and pub_year and (doaj_start_year > pub_year):
                        pass # journal wasn't open yet!
                    else:
                        # logger.info(u"open: doaj issn match!")
                        return find_normalized_license(row_license)
    return False
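is_open_via_doaj_issn expects a module-level doaj_issns iterable of (issn, license, start_year) rows, plus a find_normalized_license helper, neither of which is shown. A hedged wiring sketch (the data row and the license stub are illustrative, not real DOAJ output):

# Illustrative row; real data would come from a DOAJ dump.
doaj_issns = [
    ("2167-8359", "CC BY", 2013),
]

def find_normalized_license(text):
    # Stand-in for the project's license normalizer.
    return text.lower().replace(" ", "-")

# Hyphenation does not matter: both sides go through remove_punctuation.
is_open_via_doaj_issn(["21678359"], pub_year=2015)   # license of the match
is_open_via_doaj_issn(["2167-8359"], pub_year=2010)  # False: journal not open yet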
Example #5
def set_mendeley_data(product):

    resp = None
    doc = None

    try:
        mendeley_session = get_mendeley_session()
        if product.doi:
            method = "doi"
            try:
                doc = mendeley_session.catalog.by_identifier(doi=product.doi,
                                                             view='stats')
            except (UnicodeEncodeError, IndexError):
                return None

        elif product.title and product.year:
            biblio_title = remove_punctuation(product.title).lower()
            biblio_year = product.year
            try:
                method = "title"
                doc = mendeley_session.catalog.advanced_search(
                    title=biblio_title,
                    min_year=biblio_year,
                    max_year=biblio_year,
                    view='stats').list(page_size=1).items[0]
                mendeley_title = remove_punctuation(doc.title).lower()
                if biblio_title != mendeley_title:
                    return None
            except (UnicodeEncodeError, IndexError):
                return None

        if not doc:
            return None

        # print u"\nMatch! got the mendeley paper! for title {}".format(biblio_title)
        # print "got mendeley for {} using {}".format(product.id, method)
        resp = {}
        resp["reader_count"] = doc.reader_count
        resp["reader_count_by_academic_status"] = doc.reader_count_by_academic_status
        resp["reader_count_by_subdiscipline"] = doc.reader_count_by_subdiscipline
        resp["reader_count_by_country"] = doc.reader_count_by_country
        resp["mendeley_url"] = doc.link
        resp["abstract"] = doc.abstract
        resp["method"] = method

    except (KeyError, MendeleyException):
        pass

    return resp
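get_mendeley_session is not shown in this example. With the official mendeley SDK, a client-credentials session could be built roughly like this (a sketch assuming the mendeley package and your own credentials):

from mendeley import Mendeley

def get_mendeley_session():
    # Client credentials flow: catalog access without a user login.
    mendeley = Mendeley("my-client-id", "my-client-secret")
    auth = mendeley.start_client_credentials_flow()
    return auth.authenticate()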
Example #7
def format_text_udf(text):
    return functions.udf(
        lambda t: remove_punctuation(
            REMOVE_URL_EXPR.sub(
                "",
                strip_accents(t.lower().replace('\t', ' ').replace('\n', ' ')))
        ), types.StringType())(text)
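A hedged usage sketch follows; REMOVE_URL_EXPR and strip_accents are assumptions standing in for the project's own helpers, and remove_punctuation is as sketched earlier:

import re
import unicodedata
from pyspark.sql import SparkSession, functions, types

REMOVE_URL_EXPR = re.compile(r"https?://\S+")

def strip_accents(text):
    # Decompose accented characters, then drop the combining marks.
    return "".join(c for c in unicodedata.normalize("NFKD", text)
                   if not unicodedata.combining(c))

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("Café!\thttp://x.co\nBar",)], ["text"])
df.withColumn("clean", format_text_udf(df["text"])).show()  # "cafe  bar"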
Example #8
def fake_spaces_etc(s, text):
    """Revised the provided text such that it does not include any
       character present in ch.
       - The FAKE_SPACE character is used to space or tab.
       - Removes carriage returns.
       - Punctuation characters are removed with FAKE_SPACE protected
         should it be a punctuation character.
       - members of s are converted hex
    """
    # use FAKE_SPACE instead space and tab
    text = text.replace(' ', FAKE_SPACE)
    text = text.replace('\t', FAKE_SPACE)
    text = text.replace('\r', '')

    # Remove all punctuation but preserve FAKE_SPACE
    non_punctuation_character = chr(3)
    text = text.replace(FAKE_SPACE, non_punctuation_character)
    text = remove_punctuation(text)
    text = text.replace(non_punctuation_character, FAKE_SPACE)
    assert '-' not in s  # ASCII dash
    assert '—' not in s  # Unicode em dash

    # Convert to hex any members of s found in text
    for ch in s:
        if ch in text:
            text = text.replace(ch, f"<x{ord(ch):02x}>")

    return text
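FAKE_SPACE is a module constant that is not shown; assuming it is a sentinel character that never occurs in real input, a quick illustration:

FAKE_SPACE = chr(2)  # assumption: any character absent from real input

result = fake_spaces_etc("é", "a, bé c")
# Spaces become FAKE_SPACE, ',' is stripped, 'é' becomes a hex escape:
assert result == "a" + FAKE_SPACE + "b<xe9>" + FAKE_SPACE + "c"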
Example #9
def anagram_hash(word, ignore_punc=True):
    """
    Returns a hash of the given word, suitable for checking anagram equality:

    >>> anagram_hash("fiber")
    'befir'
    >>> anagram_hash("brief")
    'befir'

    It ignores character cases, and punctuation by default:

    >>> anagram_hash("It's")
    'ist'
    >>> anagram_hash("sit")
    'ist'

    To consider punctuation as part of the anagram, pass ignore_punc=False

    >>> anagram_hash("it's", ignore_punc=False)
    "'ist"

    """

    # Remove punctuation, if requested
    if ignore_punc:
        word = remove_punctuation(word)
    # Convert the word to lowercase, then sort its letters
    # This gives us a string that will only be equal for words that are anagrams
    return "".join(sorted(word.lower()))
Example #10
def add_to_index(p, name, id):
    text = name.lower()
    text = util.remove_accents(text)
    text = util.remove_punctuation(text)
    words = text.split()
    swords = set(words)
    for word in swords:
        w = 'si-' + word
        p.sadd(w, id)
Example #11
def artist_search(text):
    lwords = set()
    text = text.lower()
    text = util.remove_accents(text)
    text = util.remove_punctuation(text)
    words = text.split()
    swords = set(words)
    for word in swords:
        w = 'si-' + word
        lwords.add(w)
    aids = r.sinter(lwords)
    print('as', lwords, aids)
    return list(aids)
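Together, add_to_index and artist_search implement a small inverted index over Redis sets: each normalized word maps to the set of ids whose name contains it, and a search intersects those sets. A hedged wiring sketch with redis-py (names and data are illustrative):

import redis

r = redis.Redis()
p = r.pipeline()
add_to_index(p, "The Beatles", 1)
add_to_index(p, "Beatles Tribute Band", 2)
p.execute()

artist_search("the beatles")  # only id 1 is in both 'si-the' and 'si-beatles'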
Example #12
    def _get_best_match_or_none(self, search_page_html):
        '''Attempt to find the best match on the Search Results HTML.

        The method will first attempt to find a Product that contains the name
        of the SESE variety. Otherwise it will use the Product with the most
        words in common with the SESE variety name, if a minimum percentage of
        the words match (specified by
        :data:`settings.MINIMUM_NAME_MATCHING_PERCENTAGE`).

        If no results are found, the method will return :obj:`None`.

        :param search_page_html: The Search Results Page's HTML
        :type search_page_html: str
        :returns: Product Page HTML of the best match or :obj:`None` if no good
                  match is found
        :rtype: :obj:`str`
        '''
        products = self._get_results_from_search_page(search_page_html)
        has_no_results = len(products) == 0 or (self.NO_RESULT_TEXT is not None
                                                and self.NO_RESULT_TEXT
                                                in search_page_html)
        if has_no_results:
            return None

        for product in products:
            relative_url, product_name = product
            clean_product_name = remove_punctuation(product_name).lower()
            clean_sese_name = remove_punctuation(self.sese_name).lower()
            if clean_sese_name in clean_product_name:
                page_url = self.ROOT_URL + relative_url
                return get_page_html(page_url)

        product_ranks = self._prepend_name_match_amounts(products)
        best_match = product_ranks[0]
        match_amount = best_match[0]
        if match_amount >= settings.MINIMUM_NAME_MATCHING_PERCENTAGE:
            match_url = self.ROOT_URL + best_match[1][0]
            return get_page_html(match_url)
Example #13
    def process_tweet(self, tweet):
        ''' Process one tweet '''
        # Save all tweet collected
        util.save_tweet(tweet, self.output_all_tweets)

        # Transform the tweet's text in lowercase
        text = tweet['text'].lower()

        # Remove punctuation
        text = util.remove_punctuation(text)

        self.count_all += 1

        # Check whether the tweet is a crime related tweet
        if self.clf_crime.predict([text]) == 1:
            # Define the type of crime of this tweet
            type_crime = self.description[self.clf_typecrime.predict([text])]

            # Save the tweets related to crime with their respective types
            tweet['type_crime'] = type_crime

            util.save_tweet(tweet, self.output_crime_related)

            self.count_crime_related += 1

            # Apply the alias dictionary to the text
            text = self.apply_alias(text)

            # Extract full address
            street_address = self.extract_full_address(text)

            # Extract the location of the tweet
#            street_address = self.extract_street(text)

            if street_address != "":
                self.count_with_location += 1

                tweet['street_address'] = street_address

                # Extract the state information
#                tweet['state'] = self.extract_state(text)

                util.save_tweet(tweet, self.output_with_location)

        #print("All\tCrimeRelated\tLocation")
        if self.count_all % 20 == 0:
            print("%d\t%d\t\t%d" % \
                    (self.count_all, self.count_crime_related, \
                    self.count_with_location))
Example #14
def synset_review(review):
    review = unicodedata.normalize('NFKD', review).encode('ascii', 'ignore').decode('ascii')
    review = remove_stopwords(remove_punctuation(review.lower()))
    words = review.split()
    return ' '.join([' '.join(synset_word(word)) for word in words])
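remove_stopwords and synset_word are project helpers that are not shown. A plausible sketch of synset_word on top of NLTK's WordNet, as an assumption about the original intent:

from nltk.corpus import wordnet

def synset_word(word):
    # Collect the lemma names of every synset the word belongs to.
    lemmas = {lemma.name() for synset in wordnet.synsets(word)
              for lemma in synset.lemmas()}
    return sorted(lemmas) if lemmas else [word]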
Example #15
def filter_name(text):
    text = text.lower()
    text = util.remove_accents(text)
    text = util.remove_punctuation(text)
    return text
Example #17
def extract_words_fsize_line_from_page_vertical_region(page,
                                                       next_page,
                                                       region,
                                                       min_fsize=-1,
                                                       min_len=2):
    """
    Exctract words, their fontsize and the line in which they sit from the top portion of the page whose font size is
    big enough
    :type page: bs4.BeautifulSoup
    :type next_page: bs4.BeautifulSoup or None
    :param region: portion of the page to extract(between 0 and 1=all)
    :type region: float
    :param min_fsize: minimum font size of a word to be extracted
    :type min_fsize: float
    :param min_len: minimum lenght of a word to be extracted
    :type min_len: int
    :rtype: list[(str,int,int)]
    """
    # Find start of current page and next page
    page_top = util.get_coordinate_from_style(page.contents[0]['style'], 'top')
    default_page_size = 1000
    if next_page is None:
        # Last pages are badly formatted anyway, just take an approximation
        next_page_top = page_top + default_page_size
    else:
        next_page_top = util.get_coordinate_from_style(
            next_page.contents[0]['style'], 'top')

    current_line = ""
    words_fsize = []
    last_top = 0
    tag_fsize = min_fsize
    line_number = 0
    for tag in page.children:
        if hasattr(tag, "style"):
            if tag.name == "div" or tag.name == "span":

                # Check font size
                tag_fsize = util.get_coordinate_from_style(
                    tag['style'], "font-size")
                if tag_fsize is None or tag_fsize == 0:
                    tag_fsize = min_fsize
                if tag_fsize >= min_fsize:
                    # Check position in page
                    tag_top = util.get_coordinate_from_style(
                        tag['style'], 'top')
                    if tag_top is not None:
                        if tag_top < region * (next_page_top -
                                               page_top) + page_top:

                            if tag_top == last_top:
                                current_line += util.remove_punctuation(
                                    "".join(tag.strings))
                            else:
                                # Remove multiple newlines and spaces
                                current_line = re.sub(r'\n+', ' ',
                                                      current_line)
                                current_line = re.sub(r'  +', ' ',
                                                      current_line)
                                single_words = current_line.split(" ")
                                single_words = [
                                    word for word in single_words
                                    if not len(word) < min_len
                                ]

                                if len(single_words) > 0:
                                    words_fsize += [(word, tag_fsize,
                                                     line_number)
                                                    for word in single_words]
                                line_number += 1
                                current_line = util.remove_punctuation("".join(
                                    tag.strings))
                            last_top = tag_top

    if len(current_line) > 0:
        current_line = re.sub(r'\n+', ' ', current_line)
        current_line = re.sub(r'  +', ' ', current_line)
        single_words = current_line.split(" ")
        single_words = [
            word for word in single_words if not len(word) < min_len
        ]

        if len(single_words) > 0:
            words_fsize += [(word, tag_fsize, line_number)
                            for word in single_words]

    return words_fsize
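util.get_coordinate_from_style parses a numeric CSS property out of an inline style string such as "top:120px". A minimal sketch of such a helper, as an assumption rather than the project's code:

import re

def get_coordinate_from_style(style, prop):
    # e.g. get_coordinate_from_style("top:120px;left:30px", "top") -> 120.0
    match = re.search(r'%s\s*:\s*(-?\d+(?:\.\d+)?)' % re.escape(prop), style)
    return float(match.group(1)) if match else None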