Example #1
def read_documents(top_directory,
                   tokens_only=False,
                   remove_stop_words=False,
                   stop_words=None,
                   speller=None,
                   stemmer=None):
    global file_list  # module-level list of processed file paths, appended to below
    print('Reading validation documents... ', end='')
    sys.stdout.flush()
    result_list = []
    if remove_stop_words and not stop_words:
        stop_words = get_stop_words('de')
    for root, dirs, files in os.walk(top_directory):
        counter = 0
        files.sort()
        for filename in filter(lambda filename: filename.endswith('.json'),
                               files):
            counter += 1
            filename = os.path.join(root, filename)
            with codecs.open(filename, 'r', 'utf-8') as json_file:
                mail = json.load(json_file)
            body = mail['body']
            title = mail.get('subject', '')
            fContents = []
            if title != '':
                fContents.append(title)
            fContents.extend(body)
            token_list = []
            for line in fContents:
                line = line.strip().lower()
                tokens = simple_tokenize(line)

                if remove_stop_words:
                    stopped_tokens = [t for t in tokens if t not in stop_words]
                else:
                    stopped_tokens = tokens
                stopped_tokens = [
                    token for token in stopped_tokens if len(token) > 1
                ]
                if speller is not None:
                    stopped_tokens = speller.getSpellCheckedWordList(
                        stopped_tokens)
                if stemmer is not None:
                    stopped_tokens = stemmer.stemWordList(stopped_tokens)
                token_list.extend(stopped_tokens)
            file_list.append(filename)
            if tokens_only:
                result_list.append(token_list)
            else:
                result_list.append(
                    gensim.models.doc2vec.TaggedDocument(
                        token_list, [counter]))
    print('done')
    return result_list
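The examples on this page omit their imports. A minimal preamble that would let this snippet run is sketched below; treat it as an assumption, since the original module is not shown — in particular, simple_tokenize is taken from gensim.utils, get_stop_words from the 'stop-words' PyPI package, and file_list is assumed to be a module-level list.

import codecs
import json
import os
import sys

import gensim
from gensim.utils import simple_tokenize   # assumed source of simple_tokenize
from stop_words import get_stop_words      # assumed: 'stop-words' PyPI package

file_list = []  # module-level list that read_documents() appends to

train_corpus = read_documents('mails/', remove_stop_words=True)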
Example #2
    def getSpellCheckedString(self, text):
        # Replace every misspelled token with the speller's top suggestion.
        words = simple_tokenize(text)
        checked_words = []
        for word in words:
            correct_word = word
            if not self.checkWordSpelling(word):
                suggested_word = self.suggestWord(word)
                if suggested_word:
                    correct_word = suggested_word
            checked_words.append(correct_word)
        return ' '.join(checked_words)
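Calling the method only requires an object that exposes checkWordSpelling() and suggestWord(); the constructor below is hypothetical, since the class definition is not shown on this page.

speller = Speller('de_DE')  # hypothetical constructor
print(speller.getSpellCheckedString('Das ist ein Tset'))
# prints 'Das ist ein Test' if the backend suggests "Test" for "Tset"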
Example #3
def read_all_sentences(top_directory,
                       remove_stop_words=False,
                       stop_words=None,
                       speller=None,
                       stemmer=None):
    sentences = []
    # Opened for error logging, but nothing is written to it in this version.
    errfile = codecs.open('/tmp/dict-errs.txt', 'w', 'utf-8')
    print('Reading & processing source data... ')
    sys.stdout.flush()
    if remove_stop_words and not stop_words:
        stop_words = get_stop_words('de')
    for root, dirs, files in os.walk(top_directory):
        counter = 0
        files.sort()
        for filename in filter(lambda filename: filename.endswith('.txt'),
                               files):
            counter += 1
            filename = os.path.join(root, filename)
            with codecs.open(filename, 'r', 'utf-8') as txt_file:
                fContents = txt_file.readlines()
            lineC = len(fContents)
            currL = 0
            for line in fContents:
                if currL % 50 == 0:
                    print('{:-7d}/{:d} Lines processed\r'.format(currL, lineC),
                          end='')
                    sys.stdout.flush()
                currL += 1
                # codecs.open() already decodes to str; the Python 2 unicode() cast is gone.
                line = line.strip().lower()
                tokens = simple_tokenize(line)

                if remove_stop_words:
                    stopped_tokens = [t for t in tokens if t not in stop_words]
                else:
                    stopped_tokens = tokens
                stopped_tokens = [
                    token for token in stopped_tokens if len(token) > 1
                ]
                if speller is not None:
                    stopped_tokens = speller.getSpellCheckedWordList(
                        stopped_tokens)
                if stemmer is not None:
                    stopped_tokens = stemmer.stemWordList(stopped_tokens)
                if stopped_tokens:
                    sentences.append(stopped_tokens)
    errfile.close()
    print('\ndone')

    return sentences
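The returned list of token lists has exactly the shape gensim's Word2Vec constructor expects. A minimal training sketch, with hyperparameters chosen purely for illustration:

sentences = read_all_sentences('corpus/', remove_stop_words=True)
w2v_model = gensim.models.Word2Vec(sentences,
                                   vector_size=100,  # named 'size' in gensim < 4.0
                                   window=5,
                                   min_count=2,
                                   workers=4)
w2v_model.save('de.w2v')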
Example #4

def read_documents(top_directory,
                   remove_stop_words=False,
                   stop_words=None,
                   speller=None,
                   stemmer=None):
    topics = {}
    if remove_stop_words and not stop_words:
        stop_words = get_stop_words('de')
    for root, dirs, files in os.walk(top_directory):
        counter = 0
        for dirname in dirs:
            topics[dirname] = []

        for filename in filter(lambda filename: filename.endswith('.json'),
                               files):
            counter += 1
            filename = os.path.join(root, filename)
            with codecs.open(filename, 'r', 'utf-8') as json_file:
                mail = json.load(json_file)
            body = mail['body']
            title = mail.get('subject', '')
            fContents = []
            if title != '':
                fContents.append(title)
            fContents.extend(body)
            token_list = []
            for line in fContents:
                line = line.strip().lower()
                tokens = simple_tokenize(line)

                if remove_stop_words:
                    stopped_tokens = [t for t in tokens if t not in stop_words]
                else:
                    stopped_tokens = tokens
                stopped_tokens = [
                    token for token in stopped_tokens if len(token) > 1
                ]
                if speller is not None:
                    stopped_tokens = speller.getSpellCheckedWordList(
                        stopped_tokens)
                if stemmer is not None:
                    stopped_tokens = stemmer.stemWordList(stopped_tokens)
                token_list.extend(stopped_tokens)
            file_topic = os.path.basename(os.path.dirname(filename))
            if file_topic in topics:
                topics[file_topic].append(token_list)
    return topics
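Unlike Example #1, this variant groups documents by the name of their immediate parent directory, so the result can be inspected per topic:

topics = read_documents('train/', remove_stop_words=True)
for topic, docs in topics.items():
    print('{}: {} documents'.format(topic, len(docs)))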
Example #5
    def addWordsFromText(self, text):
        # Tokenize the text and add every token to the speller's dictionary.
        words = simple_tokenize(text)
        for word in words:
            self.speller.add(word)
Example #6
def analyze_directory(indir,
                      ground_truth_dir,
                      w2v_file,
                      outfile,
                      verbose,
                      outdir,
                      numthreads,
                      lang,
                      remove_stop_words,
                      speller=None,
                      stemmer=None):
    try:
        w2v_model = gensim.models.Word2Vec.load(w2v_file)
    except Exception:
        print('Could not read W2V file ({})'.format(w2v_file))
        sys.exit(1)

    stop_words = None
    if remove_stop_words:
        stop_words = get_stop_words(lang)

    topic2docid = []
    topics = read_train_docs(ground_truth_dir, remove_stop_words, stop_words,
                             speller, stemmer)

    train_corpus = []
    for key, documents in topics.items():
        for document in documents:
            topic2docid.append(key)
            document_tokens = simple_tokenize(document)
            train_corpus.append(document_tokens)

    print('Generating WMD instances... ', end='')
    sys.stdout.flush()
    wmd_instances = []
    for _ in range(numthreads):
        wmd_instance = WmdSimilarity(train_corpus, w2v_model,
                                     min(len(train_corpus), 3))
        wmd_instances.append(wmd_instance)

    print('done')
    validate_corpus = read_documents(indir, True, remove_stop_words,
                                     stop_words, speller, stemmer)
    chunk_size = len(validate_corpus) // numthreads
    result_list = {}
    queues = []
    processes = []
    results = []
    for instance in range(numthreads):
        in_q = Queue()
        out_q = Queue()
        queues.append([in_q, out_q])
        if instance == numthreads - 1:
            sp = Process(target=perform_query_on_corpus,
                         args=(wmd_instances[instance],
                               validate_corpus[instance * chunk_size:],
                               instance, len(validate_corpus), numthreads,
                               in_q, out_q))
        else:
            sp = Process(
                target=perform_query_on_corpus,
                args=(wmd_instances[instance],
                      validate_corpus[instance * chunk_size:(instance + 1) *
                                      chunk_size], instance,
                      len(validate_corpus), numthreads, in_q, out_q))
        processes.append(sp)

    for sp in processes:
        sp.start()

    for i, sp in enumerate(processes):
        q = queues[i]
        in_q = q[0]
        out_q = q[1]
        vc_index = out_q.get()
        sims_array = out_q.get()
        in_q.put('THANKS')
        in_q.close()
        sp.join()
        if sims_array:
            res = []
            for l_docId, sims in enumerate(sims_array):
                docId = (vc_index * chunk_size) + l_docId
                if verbose:
                    print_sim(validate_corpus[docId], sims, train_corpus,
                              topic2docid)
                if sims:
                    result_list[file_list[docId]] = {
                        'sim': sims[0][1],
                        'type': topic2docid[sims[0][0]]
                    }

    print('done analyzing.')
    if isinstance(outfile, str):
        json_out_f = codecs.open(outfile, 'w', 'utf-8')
    else:
        json_out_f = outfile
    print(json.dumps(result_list, indent=4), file=json_out_f)
    if isinstance(outfile, str):
        json_out_f.close()
    if outdir is not None:
        print('Copying files into subdirectories of {} ... '.format(outdir),
              end='')
        sys.stdout.flush()
        for key in topics:
            # Sanitize topic names so they are valid directory names.
            key = key.replace('/', '_').replace(':', '_')
            path = os.path.join(outdir, key)
            if not os.path.exists(path):
                os.makedirs(path)

        for source_file, result in result_list.items():
            # Apply the same sanitization used when the directories were created.
            dest_type = result['type'].replace('/', '_').replace(':', '_')
            dest_path = os.path.join(outdir, dest_type)
            dest_file = os.path.join(dest_path, os.path.basename(source_file))
            shutil.copyfile(source_file, dest_file)
        print('done')
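An end-to-end invocation might look like the sketch below. The paths are placeholders, perform_query_on_corpus, print_sim, and read_train_docs are helpers defined elsewhere in the module, and the imports are assumptions — WmdSimilarity lives in gensim.similarities in gensim 3.x.

import shutil
from multiprocessing import Process, Queue

from gensim.similarities import WmdSimilarity  # assumed import (gensim 3.x)

analyze_directory(indir='validate/',
                  ground_truth_dir='train/',
                  w2v_file='de.w2v',
                  outfile='results.json',
                  verbose=False,
                  outdir='sorted/',
                  numthreads=4,
                  lang='de',
                  remove_stop_words=True)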