Esempi in Python per Normalizer.normalize_line

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: normalizer

Classe/tipologia: Normalizer

Metodo/funzione: normalize_line

Esempi su hotexamples.com: 1

Normalizer.normalize_line in Python: 1 esempio trovato. Questo è il miglior esempio reale in Python di normalizer.Normalizer.normalize_line, estratto da progetti open source. Puoi valutarlo per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra / Nascondi

Normalizer(30)

normalize(18)

update(4)

recompute_stats(4)

read_manifest(4)

get(4)

mono_run_pipeline(4)

get_reverse(3)

transform(3)

normalize_to_stdev(3)

normalize_sentence_info(3)

crop_make_binary_image(3)

fit(3)

denormalize(2)

resize(2)

inverse_transform(2)

run_full_pipeline(2)

timezone_convert_to_est(1)

set_stats(1)

target_path(1)

test_data(1)

get_lemmas(1)

to_binary_image(1)

read_bright_field(1)

training_data(1)

convert_to_float_seconds(1)

calc_mean_and_std(1)

uuid(1)

web_run(1)

to_gray(1)

parse_file(1)

process(1)

normalize_raw(1)

get_stdev(1)

interpolate_skills(1)

missingValues(1)

getTokens(1)

normalize_from_list(1)

normalize_line(1)

normalize_reward(1)

get_stats(1)

normalize_row(1)

add(1)

normalize_suite(1)

extract_section_features(1)

observe(1)

parametrize(1)

zip_code_validation(1)

Esempio n. 1

Mostra file

class Client():
    """Search client that ranks documents with a BM25-style score.

    Per-term postings are fetched over a TCP socket using a line-based
    protocol: the client sends ``term + '\\n'`` and reads one
    newline-terminated reply made of whitespace-separated integers that
    alternate ``document_index term_frequency``.

    Document metadata is loaded from two local text files at construction
    time (see ``__init__``).
    """

    def __init__(self, host='192.168.0.107', port=7777, list_file='inputfiles-full.txt', freqs_file='wordlist', dataset_dir='/home/aelphy/Desktop/ir_project_dataset'):
        """Connect to the index server and load the document metadata.

        Args:
            host: Address of the index server.
            port: TCP port of the index server.
            list_file: Path to a text file whose lines are
                ``<index> <identifier>`` (whitespace separated).
            freqs_file: Path to a text file whose lines are
                ``<count> <identifier>``; the count is used as the document
                length term of the score.
            dataset_dir: Directory the document identifiers are resolved
                against when the best match is printed by ``rank``.

        Raises:
            Whatever ``socket.connect``, ``open``, ``int`` or the identifier
            lookup raise; the socket is closed before the error propagates.
        """
        self.normalizer = Normalizer()
        self.document_freqs_list_filename = freqs_file
        self.document_list_filename = list_file
        self.dataset_dir = dataset_dir

        self.documents_freqs = {}
        self.document_identificators = {}
        self.identificator_documents = {}
        # Stays 0 when the list file has no entries (this used to be a
        # NameError on the never-assigned loop variable).
        self.documents_number = 0

        self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            self.socket.connect((host, port))
            self._load_document_list()
            self._load_document_freqs()
        except Exception:
            # Do not leak the socket when set-up fails half-way.
            self.socket.close()
            raise

        total_length = sum(self.documents_freqs.values())
        # Guard the division so an empty document list does not crash here.
        if self.documents_number:
            self.avgdl = total_length / float(self.documents_number)
        else:
            self.avgdl = 0.0
        self.k1 = 2.0
        self.b = 0.75

    def _load_document_list(self):
        """Fill the index <-> identifier maps from the document list file."""
        with open(self.document_list_filename) as f:
            for line in f:
                data = line.split()
                if not data:
                    # Tolerate blank lines instead of failing on data[0].
                    continue
                index = int(data[0])
                identifier = data[1]
                self.document_identificators[index] = identifier
                self.identificator_documents[identifier] = index
                # Mirrors the original behaviour: the value kept is the last
                # index read, not a count of lines.
                # NOTE(review): this equals the number of documents only if
                # indices are 1-based and contiguous -- confirm.
                self.documents_number = index

    def _load_document_freqs(self):
        """Fill ``documents_freqs`` (document index -> count) from the freqs file."""
        with open(self.document_freqs_list_filename) as f:
            for line in f:
                data = line.split()
                if not data:
                    continue
                self.documents_freqs[self.identificator_documents[data[1]]] = int(data[0])

    def process_query(self, query):
        """Run a conjunctive query and return the ranked matches.

        The query is normalized into terms, the postings of every term are
        requested from the server, and only documents containing all terms
        are ranked.

        Returns:
            An empty ``set`` when the query yields no terms or no document
            contains every term; otherwise the list produced by ``rank``.
        """
        terms = self.normalizer.normalize_line(query)

        if not terms:
            return set()

        inverted_index = {}
        term_doc_tf = {}

        for term in terms:
            self.send_message(term)
            postings = self.parse_message_array(self.recieve_message().split())
            term_doc_tf[term] = postings
            inverted_index[term] = postings.keys()

        documents = self.merge(inverted_index, terms)

        if not documents:
            return set()

        return self.rank(documents, term_doc_tf, terms)

    def recieve_message(self):
        """Read one newline-terminated message from the server.

        Returns:
            The decoded message with surrounding whitespace stripped.

        Raises:
            socket.error: If the peer closes the connection before a
                complete line has been received.
        """
        chunks = []

        while True:
            chunk = self.socket.recv(1024)

            if not chunk:
                # recv() returns b'' once the peer has closed the socket;
                # looping on it would never terminate.
                raise socket.error('connection closed before end of message')

            chunks.append(chunk)

            if chunk.endswith(b'\n'):
                break

        # Decode once at the end: a multi-byte UTF-8 character may be split
        # across two recv() chunks.
        return b''.join(chunks).decode('utf-8').strip()

    def send_message(self, message):
        """Send ``message`` followed by a newline to the server."""
        # sendall() keeps writing until the whole payload is out, whereas
        # send() may transmit only a prefix.
        self.socket.sendall((message + '\n').encode('utf-8'))

    def parse_message_array(self, message_array):
        """Convert a flat ``[key, value, key, value, ...]`` token list to a dict.

        Args:
            message_array: Sequence of tokens convertible with ``int``.

        Returns:
            dict mapping ``int(key)`` to ``int(value)``.

        Raises:
            ValueError: If a token is not an integer or the number of tokens
                is odd (a key without its value).
        """
        if len(message_array) % 2:
            raise ValueError(
                'expected an even number of tokens, got %d' % len(message_array)
            )

        return {
            int(key): int(value)
            for key, value in zip(message_array[0::2], message_array[1::2])
        }

    def merge(self, inverted_index, terms):
        """Return the set of documents present in the postings of every term.

        Args:
            inverted_index: Mapping of term to an iterable of documents.
            terms: Sequence of terms to intersect.

        Returns:
            A ``set`` of documents; empty when ``terms`` is empty.
        """
        if not terms:
            return set()

        result = set(inverted_index[terms[0]])

        for term in terms[1:]:
            result.intersection_update(inverted_index[term])

        return result

    def rank(self, documents, term_doc_tf, terms):
        """Score ``documents``, print the best match and return the ranking.

        The text of the highest-scoring document is read from
        ``dataset_dir``, every query term in it is wrapped in
        ``color.RED`` / ``color.END`` and the result is printed.

        Args:
            documents: Iterable of document indices to score.
            term_doc_tf: Mapping term -> {document index: term frequency}.
            terms: Query terms.

        Returns:
            List of ``(document, score)`` tuples sorted by descending score;
            empty when ``documents`` is empty.
        """
        document_scores = {
            document: self.score_document(document, term_doc_tf, terms)
            for document in documents
        }

        result = sorted(document_scores.items(), key=lambda x: x[1], reverse=True)

        if not result:
            return result

        best_match = result[0]
        # Use this instance's map; the code used to reach for a global
        # named ``client`` here.
        best_match_path = os.path.join(self.dataset_dir, self.document_identificators[best_match[0]])

        with open(best_match_path, 'r') as best_match_file:
            best_match_data = ' '.join(best_match_file.readlines())

        for term in terms:
            best_match_data = best_match_data.replace(term, color.RED + term + color.END)

        print(best_match_data)
        return result

    def score_document(self, document, term_doc_tf, terms):
        """Compute the BM25-style score of ``document`` for ``terms``.

        For each term the contribution is
        ``idf * f * (k1 + 1) / (f + k1 * (1 - b + b * dl / avgdl))`` where
        ``f`` is the term frequency in the document, ``dl`` is
        ``documents_freqs[document]`` and
        ``idf = log((N - n + 0.5) / (n + 0.5))`` with ``N`` being
        ``documents_number`` and ``n`` the number of documents containing
        the term.

        Raises:
            KeyError: If the document is missing from a term's postings or
                from ``documents_freqs``.
        """
        result = 0
        # Length normalisation depends only on the document, not the term.
        length_norm = 1 - self.b + self.b * self.documents_freqs[document] / self.avgdl

        for term in terms:
            postings = term_doc_tf[term]
            containing = len(postings)
            idf = math.log((self.documents_number - containing + 0.5) / (containing + 0.5))
            f = postings[document]

            result += idf * (f * (self.k1 + 1)) / (f + self.k1 * length_norm)

        return result