def remove_stopwords(word, stopwords_kb=False):
    """Return WORD unless it is a stopword of the given knowledge base.

    When no knowledge base name is given (any false value), no check is
    performed and the word is returned unchanged.  Otherwise the file
    CFG_ETCDIR + "/bibrank/" + stopwords_kb is handed to is_stopword()
    and an empty string is returned for words it reports as stopwords.

    @param word: word to be checked
    @type word: str
    @param stopwords_kb: name of the stopwords knowledge base
    @type stopwords_kb: str
    @return: "" when the word is a stopword, the word itself otherwise
    """
    # Nothing to check against: hand the word back untouched.
    if not stopwords_kb:
        return word
    kb_path = CFG_ETCDIR + "/bibrank/" + stopwords_kb
    if is_stopword(word, kb_path):
        return ""
    return word
def apply_stemming_and_stopwords_and_length_check(word, stemming_language):
    """Filter WORD through the stopword and length checks, then stem it.

    An empty string is returned when is_stopword(word) is true or when the
    word is shorter than CFG_BIBINDEX_MIN_WORD_LENGTH.  A word that passes
    both checks is stemmed with stem(word, stemming_language) if a
    stemming language is given, and returned as is otherwise.

    See the config file in order to influence these.
    """
    # The stopword test runs first; the length test is only reached for
    # words that are not stopwords.
    if is_stopword(word) or len(word) < CFG_BIBINDEX_MIN_WORD_LENGTH:
        return ""
    # Stemming is optional and only happens when a language is configured.
    if not stemming_language:
        return word
    return stem(word, stemming_language)
def remove_stopwords(word, stopwords_kb=False):
    """Blank out WORD when the named knowledge base lists it as a stopword.

    The check is only made when a knowledge base name is supplied; the
    file looked up is CFG_ETCDIR + "/bibrank/" + stopwords_kb.

    @param word: word to be checked
    @type word: str
    @param stopwords_kb: name of the stopwords knowledge base
    @type stopwords_kb: str
    @return: "" for a stopword, otherwise the word unchanged
    """
    rejected = False
    if stopwords_kb:
        kb_file = CFG_ETCDIR + "/bibrank/" + stopwords_kb
        rejected = is_stopword(word, kb_file)
    if rejected:
        return ""
    return word
def check_term(term, col_size, term_rec, max_occ, min_occ, termlength):
    """Check if the term is valid for use.

    Returns "" as soon as one of these rejection tests succeeds:
      - is_stopword(term, 1) is true
      - len(term) <= termlength
      - term_rec / col_size (both converted to float) is >= max_occ
        or <= min_occ
      - int(term) is a non-zero integer

    Any StandardError raised while testing is swallowed.  When no test
    rejects the term, or when an error was swallowed, this block does not
    return anything explicitly, so the result is None.

    term - the term to check
    col_size - the number of records in database
    term_rec - the number of records which contain this term
    max_occ - max frequency of the term allowed
    min_occ - min frequency of the term allowed
    termlength - terms must be strictly longer than this to be accepted"""
    try:
        if is_stopword(term, 1):
            return ""
        if len(term) <= termlength:
            return ""
        # Share of records containing the term; only computed once the
        # cheaper tests above have passed.
        frequency = float(term_rec) / float(col_size)
        if frequency >= max_occ or frequency <= min_occ:
            return ""
        # int() raises ValueError for non-numeric terms, which ends up in
        # the except clause below.
        # NOTE(review): int("0") is falsy, so zero-valued numeric terms are
        # not rejected here - confirm whether that is intended.
        if int(term):
            return ""
    except StandardError:
        # NOTE(review): this also hides errors such as a zero col_size.
        pass
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Rank the records containing the specified words and return a sorted list.

    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value; debugging text is collected only when it is > 0
    methods - mapping indexed by rank_method_code; the entry for this method is
              read for "stopwords", "rnkWORD_table", "chars_alphanumericseparators",
              "stemmer" (optional), "col_size", "prefix" and "postfix"
    output: a 4-tuple
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]], or None
              when nothing could be ranked
    prefix - what to show before the rank value; when nothing could be ranked
             this slot carries the "Records not ranked..." message instead
    postfix - what to show after the rank value ("" when nothing could be ranked)
    voutput - contains extra information, content dependent on verbose value"""
    # NOTE(review): block nesting below was reconstructed from a
    # whitespace-collapsed source - confirm against the formatted file.
    voutput = ""
    startCreate = time.time()
    if verbose > 0:
        voutput += (
            "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />"
            % rank_method_code
        )
    # Keep the caller's words and rebuild lwords as a list of (term, table) tuples.
    lwords_old = lwords
    lwords = []
    # Check terms, remove non alphanumeric characters. Use both unstemmed and stemmed version of all terms.
    for i in range(0, len(lwords_old)):
        term = string.lower(lwords_old[i])
        # The stopword test only applies when the method's "stopwords"
        # setting equals the string "True"; otherwise every term is kept.
        if (
            not methods[rank_method_code]["stopwords"] == "True"
            or methods[rank_method_code]["stopwords"]
            and not is_stopword(term, 1)
        ):
            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
            # Replace the configured separator pattern by spaces and split
            # the term into its parts.
            terms = string.split(
                string.lower(re.sub(methods[rank_method_code]["chars_alphanumericseparators"], " ", term))
            )
            # The loop variable reuses (and overwrites) the name `term`.
            for term in terms:
                if methods[rank_method_code].has_key("stemmer"):  # stem word
                    term = stem(string.replace(term, " ", ""), methods[rank_method_code]["stemmer"])
                if lwords_old[i] != term:  # add if stemmed word is different than original word
                    lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
    # NOTE(review): lrecIDs_remove is not used anywhere else in this function.
    (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})
    # For each term, if accepted, get a list of the records using the term
    # calculate then relevance for each term before sorting the list of records
    # `table` is unpacked but not used; the table name is read from
    # methods again when building the query.
    for (term, table) in lwords:
        # The table name is interpolated into the SQL text; the term is
        # handed to run_sql separately (presumably as a bound parameter).
        term_recs = run_sql(
            """SELECT term, hitlist FROM %s WHERE term=%%s""" % methods[rank_method_code]["rnkWORD_table"], (term,)
        )
        if term_recs:  # if term exists in database, use for ranking
            # Second column of the first row is the serialized hitlist.
            term_recs = deserialize_via_marshal(term_recs[0][1])
            # Element [1] of the "Gi" entry is passed along with the term;
            # its meaning is not visible in this block.
            (recdict, rec_termcount) = calculate_record_relevance(
                (term, int(term_recs["Gi"][1])), term_recs, hitset, recdict, rec_termcount, verbose, quick=None
            )
            del term_recs
    # NOTE(review): lwords only ever receives tuples above, so the
    # `lwords[0] == ""` comparison looks like it can never be true - confirm.
    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (
            None,
            "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.",
            "",
            voutput,
        )
    else:  # sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)
    # Add any documents not ranked to the end of the list
    # NOTE(review): the (recID, 0) pairs are placed in front of reclist;
    # presumably the list is consumed from the back - confirm.
    if hitset:
        lrecIDs = list(hitset)  # using 2-3mb
        # Relies on zip() returning a list (Python 2) for the concatenation.
        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist  # using 6mb
    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += (
            "Number of terms: %s<br />"
            % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        )
        voutput += "Terms: %s<br />" % lwords
        # Both timings below are measured from the same start point.
        voutput += "Prepare and pre calculate time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        voutput += str(reclist) + "<br />"
        # The return value of rank_method_stat is not used here.
        rank_method_stat(rank_method_code, reclist, lwords)
    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Rank the records containing the specified words and return a sorted list.

    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value; debugging text is collected only when it is > 0
    methods - mapping indexed by rank_method_code; the entry for this method is
              read for "stopwords", "rnkWORD_table", "chars_alphanumericseparators",
              "stemmer" (optional), "col_size", "prefix" and "postfix"
    output: a 4-tuple
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]], or None
              when nothing could be ranked
    prefix - what to show before the rank value; when nothing could be ranked
             this slot carries the "Records not ranked..." message instead
    postfix - what to show after the rank value ("" when nothing could be ranked)
    voutput - contains extra information, content dependent on verbose value"""
    # NOTE(review): block nesting below was reconstructed from a
    # whitespace-collapsed source - confirm against the formatted file.
    voutput = ""
    startCreate = time.time()
    if verbose > 0:
        voutput += "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />" % rank_method_code
    # Keep the caller's words and rebuild lwords as a list of (term, table) tuples.
    lwords_old = lwords
    lwords = []
    #Check terms, remove non alphanumeric characters. Use both unstemmed and stemmed version of all terms.
    for i in range(0, len(lwords_old)):
        term = string.lower(lwords_old[i])
        # The stopword test only applies when the method's "stopwords"
        # setting equals the string "True"; otherwise every term is kept.
        if not methods[rank_method_code]["stopwords"] == "True" or methods[rank_method_code]["stopwords"] and not is_stopword(term):
            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
            # Replace the configured separator pattern by spaces and split
            # the term into its parts.
            terms = string.split(string.lower(re.sub(methods[rank_method_code]["chars_alphanumericseparators"], ' ', term)))
            # The loop variable reuses (and overwrites) the name `term`.
            for term in terms:
                if methods[rank_method_code].has_key("stemmer"): # stem word
                    term = stem(string.replace(term, ' ', ''), methods[rank_method_code]["stemmer"])
                if lwords_old[i] != term: #add if stemmed word is different than original word
                    lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
    # NOTE(review): lrecIDs_remove is not used anywhere else in this function.
    (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})
    #For each term, if accepted, get a list of the records using the term
    #calculate then relevance for each term before sorting the list of records
    # `table` is unpacked but not used; the table name is read from
    # methods again when building the query.
    for (term, table) in lwords:
        # The table name is interpolated into the SQL text; the term is
        # handed to run_sql separately (presumably as a bound parameter).
        term_recs = run_sql("""SELECT term, hitlist FROM %s WHERE term=%%s""" % methods[rank_method_code]["rnkWORD_table"], (term,))
        if term_recs: #if term exists in database, use for ranking
            # Second column of the first row is the serialized hitlist.
            term_recs = deserialize_via_marshal(term_recs[0][1])
            # Element [1] of the "Gi" entry is passed along with the term;
            # its meaning is not visible in this block.
            (recdict, rec_termcount) = calculate_record_relevance((term, int(term_recs["Gi"][1])) , term_recs, hitset, recdict, rec_termcount, verbose, quick=None)
            del term_recs
    # NOTE(review): lwords only ever receives tuples above, so the
    # `lwords[0] == ""` comparison looks like it can never be true - confirm.
    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (None, "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.", "", voutput)
    else: #sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)
    #Add any documents not ranked to the end of the list
    # NOTE(review): the (recID, 0) pairs are placed in front of reclist;
    # presumably the list is consumed from the back - confirm.
    if hitset:
        lrecIDs = list(hitset) #using 2-3mb
        # Relies on zip() returning a list (Python 2) for the concatenation.
        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist #using 6mb
    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += "Number of terms: %s<br />" % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        voutput += "Terms: %s<br />" % lwords
        # Both timings below are measured from the same start point.
        voutput += "Prepare and pre calculate time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        voutput += str(reclist) + "<br />"
        # The return value of rank_method_stat is not used here.
        rank_method_stat(rank_method_code, reclist, lwords)
    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)