Code Example #1
File: index_search.py  Project: marinapts/ttds
def ranked_retrieval(queries, collection_table, doc_nums, inverted_index,
                     stop_words):
    ranked_scores = {}

    for query_index, query in enumerate(queries):
        query_tokens = normalise(remove_stop_words(tokenise(query),
                                                   stop_words))

        # Convert query into an OR boolean search and use eval to evaluate it
        boolean_vectors = []
        for token in query_tokens:
            boolean_vector = collection_table[token]
            boolean_vectors.append('np.array([{}])'.format(
                array_to_string(boolean_vector)))

        query_eval_string = ' | '.join(boolean_vectors)
        query_documents = boolean_search(query_eval_string, doc_nums)

        query_scores = []
        # Score each matching document against the query with TF-IDF
        for doc in query_documents:
            score = TFIDF(doc, query_tokens, len(doc_nums), inverted_index)
            query_scores.append((doc, score))

        # Sort scores for each query in descending order
        query_scores = sorted(query_scores, key=lambda x: x[1], reverse=True)
        ranked_scores[query_index + 1] = query_scores

    return ranked_scores
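The example relies on a TFIDF helper that is not shown. Below is a minimal sketch of how such a scorer is commonly written, assuming a positional inverted index of the form {term: {doc_id: [positions]}} and the usual (1 + log tf) * log(N / df) weighting; both the index layout and the weighting are assumptions, not taken from the project.

import math

# Hypothetical sketch of the TFIDF helper assumed above.
def TFIDF(doc, query_tokens, num_docs, inverted_index):
    score = 0.0
    for term in query_tokens:
        postings = inverted_index.get(term, {})
        if doc in postings:
            tf = len(postings[doc])   # term frequency in this document
            df = len(postings)        # number of documents containing the term
            score += (1 + math.log10(tf)) * math.log10(num_docs / df)
    return score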
Code Example #2
File: tweetnet.py  Project: vasaura/cours-python
def computerelations(self, graph):
    for tweet in self:
        tokens = preprocess.tokenise(tweet.message)
        for token in tokens:
            # Does this token look like Twitter's @recipient syntax?
            if token and token[0] == '@':
                user = token[1:]
                self.addrelation(user)
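The @-mention check above does not depend on the surrounding class. Here is a hypothetical standalone version of the same pattern, assuming the tokeniser simply splits the message on whitespace (an assumption; preprocess.tokenise may behave differently):

# Hypothetical standalone version of the @recipient check used above
def extract_mentions(message, tokenise=str.split):
    mentions = []
    for token in tokenise(message):
        if token and token[0] == '@':   # Twitter-style @recipient syntax
            mentions.append(token[1:])
    return mentions

print(extract_mentions('@alice thanks for the RT @bob'))  # -> ['alice', 'bob']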
Code Example #3
def get_word_indices_in_text(words, text):
    # Tokenise the text once, then record the 1-based position of the first
    # occurrence of each word (raises ValueError if a word is not present).
    text_tokens = tokenise(text)
    word_indices = []

    for word in words:
        word_index = text_tokens.index(word.lower())
        word_indices.append(word_index + 1)

    return word_indices
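A hypothetical call, assuming tokenise lowercases its input and splits on whitespace (neither behaviour is shown in the example above):

# With tokenise('A quick test') == ['a', 'quick', 'test'],
# the helper returns 1-based positions.
print(get_word_indices_in_text(['quick', 'test'], 'A quick test'))  # -> [2, 3]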
Code Example #4
def load_docs_from_trec():
    # Load the provided TREC sample XML and collect one entry per DOC element
    root = load_xml(TREC_SAMPLE_FILE, './DOC')
    token_doc_list = []
    tokenised_docs = {}
    doc_nums = []

    for doc in root:
        doc_no = doc.find('DOCNO').text
        headline = doc.find('HEADLINE').text
        text = doc.find('TEXT').text
        headline_with_text = headline + ' ' + text

        # Preprocess once and reuse the result for both the list and the dict
        tokens = preprocess(headline_with_text)
        doc_nums.append(doc_no)
        token_doc_list.append(tokens)
        tokenised_docs[doc_no] = tokens

    return doc_nums, token_doc_list, tokenised_docs
Code Example #5
def extract_features(filename):
    "Open and tokenise the contents of a file."
    return tokenise(read_corpus_file(filename))
Code Example #6
File: ari.py  Project: vasaura/cours-python
def prepare_text(filename):
    return splitsentences(tokenise(readcorpusfile(filename)))
Code Example #7
def preprocess(doc):
    return stemming(remove_stop_words(tokenise(doc), stop_words))
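For reference, a minimal sketch of the three helpers this pipeline assumes. The regex tokenisation and the use of NLTK's PorterStemmer are assumptions; the project's own implementations may differ.

import re
from nltk.stem import PorterStemmer

def tokenise(doc):
    # Lowercase and split on runs of non-word characters (assumed behaviour)
    return [t for t in re.split(r'[^\w]+', doc.lower()) if t]

def remove_stop_words(tokens, stop_words):
    return [t for t in tokens if t not in stop_words]

def stemming(tokens):
    stemmer = PorterStemmer()
    return [stemmer.stem(t) for t in tokens]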