Code Example #1
def processEntries(db: couchdb.Database):
    # The reduce view returns a single row holding the total number of sentences.
    totalSentences = [x for x in db.iterview('sentences/sentences_count', 10)][0].value
    sentenceCount = 1
    for entry in db.iterview('sentences/sentences', 100):
        if sentenceCount % 1000 == 0:
            printProgress(sentenceCount, totalSentences)
        for word in sentences.splitInWords(entry.value['sentence']):
            if word:  # skip empty tokens
                updateWordDocument(db, word, entry.value)
        sentenceCount += 1
    print("")  # to clear printProgress
Code Example #2
def getMinimumSentenceThreshold(db: couchdb.Database,
                                sentences_length_view: str, threshold: int):
    # Collect the distinct sentence counts per word from the grouped reduce view.
    availableLengths = [
        x.key
        for x in db.iterview(sentences_length_view, 2000000, group_level=1)
    ]
    availableLengths.sort(reverse=True)
    logging.info(f'The available lengths per word are {availableLengths}')
    # Return the count sitting at the top `threshold` percent boundary; words
    # with fewer sentences than this value will later be discarded.
    lastElement = round(len(availableLengths) * (threshold / 100))
    return availableLengths[lastElement]
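A hypothetical call, assuming a grouped reduce view (here named 'sentences/sentences_length', a name not shown in these listings) that emits one row per word keyed by its sentence count; keeping the top 10 % of counts would look like this:

minSentences = getMinimumSentenceThreshold(db, 'sentences/sentences_length', 10)
logging.info(f'Words need at least {minSentences} sentences to be kept')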
Code Example #3
def processEntries(db: couchdb.Database, sleepHourList: list):
    totalSentences = getTotalSentences(
        db, 'sentences/sentences_not_processed_count')
    sentenceCount = 1
    for entry in db.iterview('sentences/sentences_not_processed', 100):
        waitWhileSleepHour(sleepHourList)
        if sentenceCount % 1000 == 0:
            printProgress(sentenceCount, totalSentences)
        wordSet = {word for word in entry.value["word_list"] if word != ""}
        for word in wordSet:
            updateWordDocument(db, word, entry.value)
        setSentenceAsVisited(db, entry.id)
        sentenceCount += 1
    if sentenceCount > 1:
        print("")  # to clear printProgress
Code Example #4
def processEntries(db: couchdb.Database, corpus_result_dir: str,
                   sentenceThreshold: int):
    words_path = os.path.join(corpus_result_dir, WORDS_TXT)
    discarded_words_path = os.path.join(corpus_result_dir, DISCARDED_WORDS_TXT)
    corpus_path = os.path.join(corpus_result_dir, CORPUS_TXT)
    deleteFileIfExists(words_path)
    deleteFileIfExists(corpus_path)
    with open(words_path, "wt", encoding="utf-8") as words, \
         open(corpus_path, "wt", encoding="utf-8") as corpus, \
         open(discarded_words_path, "wt", encoding="utf-8") as discarded:
        words.write(WORDS_TXT_HEADER)
        corpus.write(CORPUS_TXT_HEADER)
        discarded.write(DISCARDED_WORDS_TXT_HEADER)
        # Words backed by enough sentences go into the corpus; the rest are
        # written to the discarded list.
        for row in db.iterview('all_words/all_words', 100):
            if len(row.value['sentences']) >= sentenceThreshold:
                words.write(f"{row.value['_id']}\n")
                writeCorpusInfo(db, corpus, row.value)
            else:
                discarded.write(f"{row.value['_id']}\n")
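deleteFileIfExists is not defined above either. A minimal sketch of the assumed behaviour:

import os

def deleteFileIfExists(path: str):
    # Hypothetical helper: remove the file if it is present, do nothing otherwise.
    if os.path.isfile(path):
        os.remove(path)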
Code Example #5
def deleteAllWords(db: couchdb.Database):
    for row in db.iterview('all_words/all_words', 100):
        db.delete(row.value)
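Deleting the documents one by one issues a separate HTTP request per word. A sketch of an alternative using couchdb-python's bulk Database.update, assuming (as the per-document db.delete call above already requires) that each row's value carries the document's _id and _rev:

def deleteAllWordsBulk(db: couchdb.Database):
    # Mark every word document as deleted and send them in one _bulk_docs request.
    docs = []
    for row in db.iterview('all_words/all_words', 100):
        doc = row.value
        doc['_deleted'] = True
        docs.append(doc)
    if docs:
        db.update(docs)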
Code Example #6
def getTotalSentences(db: couchdb.Database, totalSentencesView: str):
    # The reduce view yields at most one row whose value is the total count.
    rows = list(db.iterview(totalSentencesView, 10))
    if rows:
        return rows[0].value
    return 0
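Code Example #1 fetches the same total inline; with this helper that lookup becomes:

totalSentences = getTotalSentences(db, 'sentences/sentences_count')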
Code Example #7
def set_urls_as_not_visited(db: couchdb.Database, not_visited_view: str):
    # Reset the visited flag on every document emitted by the view.
    for url in db.iterview(not_visited_view, 100):
        urlDoc = db[url.id]
        urlDoc['visited'] = False
        db.save(urlDoc)
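A hypothetical call, assuming a view (here named 'urls/not_visited_urls'; the real view name does not appear in these listings) that emits the URL documents whose flag should be reset:

set_urls_as_not_visited(db, 'urls/not_visited_urls')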