Esempio n. 1
0
def remove_stopwords(word, stopwords_kb=None):
    """Return the word, or an empty string if it is a stopword.

       The stopword check only runs when a knowledge base name is given;
       with stopwords_kb left as None the word is returned unchanged.

       @param word: word to be checked
       @type word: str
       @param stopwords_kb: name of the stopwords knowledge base
       @type stopwords_kb: str
       @return: word itself, or "" when is_stopword() reports it as a
           stopword of the given knowledge base
    """
    # Without a knowledge base there is nothing to check against.
    if stopwords_kb is None:
        return word
    return "" if is_stopword(word, stopwords_kb) else word
Esempio n. 2
0
def remove_stopwords(word, stopwords_kb=None):
    """Filter a single word against a stopwords knowledge base.

       When no knowledge base name is supplied (stopwords_kb is None) no
       check is performed and the word is handed back as is.

       @param word: word to be checked
       @type word: str
       @param stopwords_kb: name of the stopwords knowledge base
       @type stopwords_kb: str
       @return: "" if is_stopword() flags the word, otherwise the word
    """
    # Short-circuit keeps is_stopword() from being called without a KB name.
    filtered_out = stopwords_kb is not None and is_stopword(word, stopwords_kb)
    if filtered_out:
        return ""
    return word
Esempio n. 3
0
def check_term(term, col_size, term_rec, max_occ, min_occ, termlength):
    """Check if the term is valid for use.

    term - the term to check
    col_size - the number of records in database
    term_rec - the number of records which contain this term
    max_occ - max frequency of the term allowed; the frequency is the
              ratio term_rec / col_size and must stay strictly below this
    min_occ - min frequency of the term allowed; the ratio must stay
              strictly above this
    termlength - length threshold; terms whose length is less than or
                 equal to this value are rejected

    Returns "true" when the term is accepted and "" when it is rejected.
    A term is rejected when it is a stopword, too short, too frequent,
    too rare, or parses as an integer."""

    try:
        if is_stopword(term) or len(term) <= termlength:
            return ""
        # Computed once; both bounds are checked against the same ratio.
        frequency = float(term_rec) / float(col_size)
        if frequency >= max_occ or frequency <= min_occ:
            return ""
    except StandardError:
        # Best-effort: a failure in the checks above (for instance a zero
        # col_size) leaves the term accepted rather than raising.
        return "true"

    # Reject every term that parses as an integer. Testing the truthiness of
    # int(term) would let zero-valued terms such as "0" or "000" through.
    try:
        int(term)
    except (TypeError, ValueError):
        return "true"
    return ""
Esempio n. 4
0
def check_term(term, col_size, term_rec, max_occ, min_occ, termlength):
    """Check if the term is valid for use.

    term - the term to check
    col_size - the number of records in database
    term_rec - the number of records which contain this term
    max_occ - max frequency of the term allowed (compared against the
              ratio term_rec / col_size; the ratio must be strictly lower)
    min_occ - min frequency of the term allowed (the ratio must be
              strictly higher)
    termlength - length threshold; a term must be strictly longer than
                 this to be accepted

    Returns "" for a rejected term (stopword, too short, too frequent,
    too rare, or an integer) and "true" for an accepted one."""

    try:
        if is_stopword(term):
            return ""
        if len(term) <= termlength:
            return ""
        # Share of the collection's records that contain the term.
        ratio = float(term_rec) / float(col_size)
        if ratio >= max_occ or ratio <= min_occ:
            return ""
    except StandardError:
        # Best-effort: when one of the checks above fails (e.g. col_size
        # is zero) the term is accepted instead of propagating the error.
        return "true"

    # Numeric terms are rejected. The conversion result itself is not
    # tested, because int("0") is falsy and zero-valued terms would pass.
    try:
        int(term)
    except (TypeError, ValueError):
        return "true"
    return ""
Esempio n. 5
0
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Rank the records containing the query words and return them sorted.

    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD;
                       used as the key into methods
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value; debug output is collected when it is > 0
    methods - settings per rank method code; this function reads the keys
              "stopwords", "rnkWORD_table", "chars_alphanumericseparators",
              "col_size", "prefix", "postfix" and, optionally, "stemmer"
    output (a 4-tuple):
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]], or None
              when nothing could be ranked
    prefix - what to show before the rank value (an explanatory message
             when nothing could be ranked)
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    voutput = ""
    start_time = time.time()

    if verbose > 0:
        voutput += (
            "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />"
            % rank_method_code
        )

    # Build the list of (term, table) pairs to look up: each accepted word in
    # lowercase, plus its separator-split (and, if configured, stemmed) parts.
    query_words = lwords
    lwords = []
    for query_word in query_words:
        lowered = string.lower(query_word)
        settings = methods[rank_method_code]
        # Stopwords are filtered only when the setting is the string "True".
        if settings["stopwords"] == "True" and is_stopword(lowered):
            continue
        lwords.append((lowered, settings["rnkWORD_table"]))
        cleaned = re.sub(settings["chars_alphanumericseparators"], " ", lowered)
        for piece in string.split(string.lower(cleaned)):
            if "stemmer" in settings:  # stem word
                piece = stem(string.replace(piece, " ", ""), settings["stemmer"])
            # The comparison is against the word as given in the query,
            # not against its lowercased form.
            if query_word != piece:
                lwords.append((piece, settings["rnkWORD_table"]))

    # Fetch the hit list of every term found in the word table and
    # accumulate the per-record relevance before sorting.
    recdict = {}
    rec_termcount = {}
    for term, _table in lwords:
        lookup = """SELECT term, hitlist FROM %s WHERE term=%%s""" % methods[rank_method_code]["rnkWORD_table"]
        rows = run_sql(lookup, (term,))
        if not rows:  # term not in the database, nothing to rank with
            continue
        term_hits = deserialize_via_marshal(rows[0][1])
        del rows
        recdict, rec_termcount = calculate_record_relevance(
            (term, int(term_hits["Gi"][1])), term_hits, hitset, recdict, rec_termcount, verbose, quick=None
        )
        del term_hits

    nothing_ranked = len(recdict) == 0
    # NOTE(review): lwords holds (term, table) tuples, so this comparison
    # with "" looks like it can never be true - confirm the intent.
    single_blank_term = len(lwords) == 1 and lwords[0] == ""
    if nothing_ranked or single_blank_term:
        return (
            None,
            "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.",
            "",
            voutput,
        )

    # We have something to sort.
    reclist, hitset = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    # Whatever is left in hitset was not ranked: put it in front of the
    # ranked records with a rank value of 0.
    if hitset:
        reclist = [(recid, 0) for recid in list(hitset)] + reclist

    if verbose > 0:
        settings = methods[rank_method_code]
        debug_parts = [
            "<br />Current number of recIDs: %s<br />" % (settings["col_size"]),
            "Number of terms: %s<br />" % run_sql("SELECT count(id) FROM %s" % settings["rnkWORD_table"])[0][0],
            "Terms: %s<br />" % lwords,
            "Prepare and pre calculate time: %s<br />" % str(time.time() - start_time),
            "Total time used: %s<br />" % str(time.time() - start_time),
            str(reclist) + "<br />",
        ]
        voutput += "".join(debug_parts)
        rank_method_stat(rank_method_code, reclist, lwords)

    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
Esempio n. 6
0
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance,
                    verbose, methods):
    """Rank the records that contain the given words; return a sorted list.

    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD;
                       it selects the settings entry inside methods
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value; extra output is produced when it is > 0
    methods - settings indexed by rank method code; the keys read here are
              "stopwords", "rnkWORD_table", "chars_alphanumericseparators",
              "col_size", "prefix", "postfix" and the optional "stemmer"
    output (a 4-tuple):
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]; None
              if no record could be ranked
    prefix - what to show before the rank value; replaced by an explanatory
             message if no record could be ranked
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    debugging = verbose > 0
    voutput = ""
    started = time.time()

    if debugging:
        voutput = voutput + (
            "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />"
            % rank_method_code)

    # Check terms, remove non alphanumeric characters. Both the unstemmed
    # and the stemmed version of every accepted term are used.
    original_words = lwords
    lwords = []
    for position, _word in enumerate(original_words):
        base_term = string.lower(original_words[position])
        config = methods[rank_method_code]
        is_rejected = config["stopwords"] == "True" and is_stopword(base_term)
        if not is_rejected:
            lwords.append((base_term, config["rnkWORD_table"]))
            separated = re.sub(config["chars_alphanumericseparators"], ' ',
                               base_term)
            for sub_term in string.split(string.lower(separated)):
                if "stemmer" in config:  # stem word
                    sub_term = stem(string.replace(sub_term, ' ', ''),
                                    config["stemmer"])
                # add if the (stemmed) word differs from the original word
                if original_words[position] != sub_term:
                    lwords.append((sub_term, config["rnkWORD_table"]))

    # For each term known to the word table, load its hit list and add its
    # contribution to the relevance of the records.
    recdict, rec_termcount = {}, {}
    for term, _unused_table in lwords:
        found = run_sql(
            """SELECT term, hitlist FROM %s WHERE term=%%s""" %
            methods[rank_method_code]["rnkWORD_table"], (term, ))
        if found:  # term exists in database, use it for ranking
            found = deserialize_via_marshal(found[0][1])
            term_info = (term, int(found["Gi"][1]))
            recdict, rec_termcount = calculate_record_relevance(
                term_info,
                found,
                hitset,
                recdict,
                rec_termcount,
                verbose,
                quick=None)
            del found

    # NOTE(review): entries of lwords are (term, table) tuples, so the
    # comparison with "" appears to be always false - confirm the intent.
    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        not_ranked_message = (
            "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible."
        )
        return (None, not_ranked_message, "", voutput)

    # sort, since we got something to sort
    reclist, hitset = sort_record_relevance(
        recdict, rec_termcount, hitset, rank_limit_relevance, verbose)

    # Records still in hitset were not ranked; they are placed in front of
    # the ranked ones with a rank value of 0.
    if hitset:
        unranked_ids = list(hitset)
        unranked = zip(unranked_ids, [0] * len(unranked_ids))
        reclist = unranked + reclist

    if debugging:
        config = methods[rank_method_code]
        voutput += "<br />Current number of recIDs: %s<br />" % (
            config["col_size"])
        term_count = run_sql(
            "SELECT count(id) FROM %s" % config["rnkWORD_table"])[0][0]
        voutput += "Number of terms: %s<br />" % term_count
        voutput += "Terms: %s<br />" % lwords
        elapsed = str(time.time() - started)
        voutput += "Prepare and pre calculate time: %s<br />" % (elapsed)
        elapsed = str(time.time() - started)
        voutput += "Total time used: %s<br />" % (elapsed)
        voutput += str(reclist) + "<br />"
        rank_method_stat(rank_method_code, reclist, lwords)

    prefix = methods[rank_method_code]["prefix"]
    postfix = methods[rank_method_code]["postfix"]
    return (reclist, prefix, postfix, voutput)