Python is_stopword Examples, invenio.bibindex_engine_stopwords.is_stopword Python Examples

Example #1

0

Show file

def remove_stopwords(word, stopwords_kb=False):
    """Return word, or "" when it is a stopword of the given knowledge base.

       When stopwords_kb is falsy (the default) no check is made and the
       word is returned unchanged.

       @param word: word to be checked
       @type word: str
       @param stopwords_kb: name of the stopwords knowledge base; it is
           appended to CFG_ETCDIR + "/bibrank/" to build the path handed
           to is_stopword()
       @type stopwords_kb: str
       @return: "" if is_stopword() accepts the word as a stopword,
           otherwise the word itself
    """
    # Without a knowledge base there is nothing to check against.
    if not stopwords_kb:
        return word
    kb_path = CFG_ETCDIR + "/bibrank/" + stopwords_kb
    return "" if is_stopword(word, kb_path) else word

Example #2

0

Show file

File: bibindex_engine_washer.py Project: bopopescu/augustsedlacek

def apply_stemming_and_stopwords_and_length_check(word, stemming_language):
    """Return WORD after the stopword check, the length check and stemming.

       The word is dropped (returned as "") when is_stopword() accepts it
       or when it is shorter than CFG_BIBINDEX_MIN_WORD_LENGTH.  Both
       checks are made on the unstemmed word.  A surviving word is passed
       through stem() only when STEMMING_LANGUAGE is truthy.
    """
    # Rejection checks first: stopword, then minimum length.
    if is_stopword(word) or len(word) < CFG_BIBINDEX_MIN_WORD_LENGTH:
        return ""
    # No stemming configured: hand the word back untouched.
    if not stemming_language:
        return word
    return stem(word, stemming_language)

Example #3

0

Show file

File: bibindex_engine_washer.py Project: chezjohnny/invenio

def apply_stemming_and_stopwords_and_length_check(word, stemming_language):
    """Filter WORD by stopword list and minimum length, then stem it.

       Returns "" for a word that is_stopword() accepts or whose length is
       below CFG_BIBINDEX_MIN_WORD_LENGTH.  Otherwise returns the word,
       stemmed with stem() when STEMMING_LANGUAGE is truthy and unchanged
       when it is not.
    """
    # The stopword test runs first; the length test only runs for non-stopwords.
    rejected = is_stopword(word) or len(word) < CFG_BIBINDEX_MIN_WORD_LENGTH
    if rejected:
        return ""
    # Stemming is optional and applied last, on words that passed both checks.
    return stem(word, stemming_language) if stemming_language else word

Example #4

0

Show file

File: bibindex_engine_washer.py Project: BessemAamira/invenio

def remove_stopwords(word, stopwords_kb=False):
    """Blank out word if the named stopwords knowledge base contains it.

       The knowledge base file is looked up under CFG_ETCDIR + "/bibrank/".
       If no knowledge base name is given, the word is never checked.

       @param word: word to be checked
       @type word: str
       @param stopwords_kb: name of the stopwords knowledge base
       @type stopwords_kb: str
       @return: "" when the word is a stopword according to is_stopword(),
           the unchanged word in every other case
    """
    # Short-circuit keeps is_stopword() from being called without a knowledge base.
    if stopwords_kb and is_stopword(word, CFG_ETCDIR + "/bibrank/" + stopwords_kb):
        return ""
    return word

Example #5

0

Show file

File: bibrank_record_sorter.py Project: AlbertoPeon/invenio

def check_term(term, col_size, term_rec, max_occ, min_occ, termlength):
    """Check if the term is valid for use.

    term - the term to check
    col_size - the number of records in database
    term_rec - the number of records which contain this term
    max_occ - max frequency of the term allowed
    min_occ - min frequency of the term allowed
    termlength - the minimum length of the terms allowed

    Returns "" when the term is rejected: is_stopword(term, 1) accepts
    it, it is not longer than termlength, its share of records
    (term_rec / col_size) is >= max_occ or <= min_occ, or int(term)
    yields a non-zero number.  Any StandardError raised by these checks
    is swallowed.
    NOTE(review): no explicit return for an accepted term is visible in
    this block, so such calls fall off the end of the function.
    """

    try:
        if is_stopword(term, 1):
            return ""
        if len(term) <= termlength:
            return ""
        # Share of the collection's records that contain the term.
        frequency = float(term_rec) / float(col_size)
        if frequency >= max_occ or frequency <= min_occ:
            return ""
        # int() raises for non-numeric terms; the handler below swallows
        # that, so only terms parsing to a non-zero integer are rejected here.
        if int(term):
            return ""
    except StandardError:
        pass

Example #6

0

Show file

File: bibrank_record_sorter.py Project: epfl-si/invenio-infoscience

def check_term(term, col_size, term_rec, max_occ, min_occ, termlength):
    """Decide whether a term may be used.

    term - the term to check
    col_size - the number of records in database
    term_rec - the number of records which contain this term
    max_occ - max frequency of the term allowed
    min_occ - min frequency of the term allowed
    termlength - the minimum length of the terms allowed

    A rejected term yields "".  Rejection happens when
    is_stopword(term, 1) is true, when len(term) <= termlength, when
    term_rec / col_size lies at or above max_occ or at or below min_occ,
    or when int(term) is a non-zero number.  A StandardError raised while
    checking is silently ignored.
    NOTE(review): the block as shown has no return statement for the
    accepted case; control simply reaches the end of the function.
    """

    try:
        # Cheap checks first, in the same order as the occurrence checks below rely on.
        if is_stopword(term, 1) or len(term) <= termlength:
            return ""
        share = float(term_rec) / float(col_size)
        if share >= max_occ:
            return ""
        if share <= min_occ:
            return ""
        # Non-numeric terms make int() raise, which the except clause absorbs.
        if int(term):
            return ""
    except StandardError:
        pass

Example #7

0

Show file

File: bibrank_word_searcher.py Project: adsabs/invenio

def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Rank the records containing the specified words and return a sorted list.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value; above 0, debug text is collected in voutput
    methods - settings per rank method code; the keys read here are
        "stopwords", "rnkWORD_table", "chars_alphanumericseparators",
        "stemmer" (optional), "col_size", "prefix" and "postfix"
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]], or
        None when nothing could be ranked
    prefix - what to show before the rank value (an explanatory message
        when nothing could be ranked)
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    voutput = ""
    startCreate = time.time()

    if verbose > 0:
        voutput += (
            "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />"
            % rank_method_code
        )

    query_words = lwords
    lwords = []
    # Collect (term, table) pairs: each accepted lower-cased query word plus
    # the separator-split (and, when configured, stemmed) pieces that differ
    # from the word as it was given.
    for query_word in query_words:
        term = query_word.lower()
        settings = methods[rank_method_code]
        # Stopword filtering only applies when the method sets "stopwords" to "True".
        if settings["stopwords"] == "True" and is_stopword(term, 1):
            continue
        word_table = settings["rnkWORD_table"]
        lwords.append((term, word_table))
        separators = settings["chars_alphanumericseparators"]
        for piece in re.sub(separators, " ", term).lower().split():
            if "stemmer" in settings:  # stem word
                piece = stem(piece.replace(" ", ""), settings["stemmer"])
            if query_word != piece:  # add only if it differs from the original word
                lwords.append((piece, word_table))

    recdict = {}
    rec_termcount = {}
    # For each term found in the word table, fold its hit list into the
    # running relevance values before the records get sorted.
    for term, _table in lwords:
        term_recs = run_sql(
            """SELECT term, hitlist FROM %s WHERE term=%%s""" % methods[rank_method_code]["rnkWORD_table"], (term,)
        )
        if not term_recs:  # term unknown to the database, nothing to rank with
            continue
        term_recs = deserialize_via_marshal(term_recs[0][1])
        recdict, rec_termcount = calculate_record_relevance(
            (term, int(term_recs["Gi"][1])), term_recs, hitset, recdict, rec_termcount, verbose, quick=None
        )
        del term_recs

    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (
            None,
            "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.",
            "",
            voutput,
        )

    # There is something to sort.
    (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    # Records still left in hitset were not ranked: put them in front of
    # the ranked ones with a rank value of 0.
    if hitset:
        lrecIDs = list(hitset)  # using 2-3mb
        reclist = [(recid, 0) for recid in lrecIDs] + reclist  # using 6mb

    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        term_count = run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        voutput += "Number of terms: %s<br />" % term_count
        voutput += "Terms: %s<br />" % lwords
        # Both timings below are measured from the same start point.
        voutput += "Prepare and pre calculate time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        voutput += str(reclist) + "<br />"
        rank_method_stat(rank_method_code, reclist, lwords)

    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)

Example #8

0

Show file

File: bibrank_word_searcher.py Project: pombredanne/invenio-old-2

def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Rank the records containing the specified words and return a sorted list.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value; debug text is added to voutput when it is above 0
    methods - per-method settings keyed by rank method code; this function
        reads "stopwords", "rnkWORD_table", "chars_alphanumericseparators",
        "stemmer" (only if present), "col_size", "prefix" and "postfix"
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]; None
        if no record could be ranked
    prefix - what to show before the rank value; replaced by an
        explanation if no record could be ranked
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""

    voutput = ""
    startCreate = time.time()

    if verbose > 0:
        voutput += "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />" % rank_method_code

    given_words = lwords
    lwords = []
    # Check terms and split them on the configured separator characters.
    # Both the lower-cased word and its differing (stemmed) pieces are kept.
    for given_word in given_words:
        term = given_word.lower()
        method = methods[rank_method_code]
        # Words are only tested against the stopword list when the method
        # has "stopwords" set to "True".
        if method["stopwords"] != "True" or not is_stopword(term):
            table_name = method["rnkWORD_table"]
            lwords.append((term, table_name))
            cleaned = re.sub(method["chars_alphanumericseparators"], ' ', term)
            for part in cleaned.lower().split():
                if "stemmer" in method:  # stem word
                    part = stem(part.replace(' ', ''), method["stemmer"])
                if given_word != part:  # add if different from the original word
                    lwords.append((part, table_name))

    recdict, rec_termcount = {}, {}
    # For every term present in the word table, update the relevance of the
    # records that use it; sorting happens afterwards.
    for term, _table in lwords:
        query = """SELECT term, hitlist FROM %s WHERE term=%%s""" % methods[rank_method_code]["rnkWORD_table"]
        term_recs = run_sql(query, (term,))
        if term_recs:  # term exists in database, use it for ranking
            term_recs = deserialize_via_marshal(term_recs[0][1])
            term_info = (term, int(term_recs["Gi"][1]))
            (recdict, rec_termcount) = calculate_record_relevance(term_info, term_recs, hitset, recdict, rec_termcount, verbose, quick=None)
            del term_recs

    nothing_ranked = len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == "")
    if nothing_ranked:
        return (None, "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.", "", voutput)

    # We got something to sort.
    (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    # Whatever remains in hitset was not ranked; those records go in front
    # of the ranked list with a rank value of 0.
    if hitset:
        lrecIDs = list(hitset)                       # using 2-3mb
        unranked = [(recid, 0) for recid in lrecIDs]
        reclist = unranked + reclist                 # using 6mb

    if verbose > 0:
        method = methods[rank_method_code]
        voutput += "<br />Current number of recIDs: %s<br />" % (method["col_size"])
        voutput += "Number of terms: %s<br />" % run_sql("SELECT count(id) FROM %s" % method["rnkWORD_table"])[0][0]
        voutput += "Terms: %s<br />" % lwords
        # Note that both timings are taken relative to the same start time.
        voutput += "Prepare and pre calculate time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        voutput += str(reclist) + "<br />"
        rank_method_stat(rank_method_code, reclist, lwords)

    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)