Example No. 1
class Indexator:
    """
    Contains methods for loading the JSON documents into the corpus
    database.
    """
    SETTINGS_DIR = '../conf'

    def __init__(self):
        f = open(os.path.join(self.SETTINGS_DIR, 'corpus.json'),
                 'r', encoding='utf-8')
        self.settings = json.loads(f.read())
        f.close()
        self.name = self.settings['corpus_name']
        self.languages = self.settings['languages']
        if len(self.languages) <= 0:
            self.languages = [self.name]
        self.input_format = self.settings['input_format']
        self.corpus_dir = os.path.join('../src_convertors/corpus', self.name, 'json')
        self.iterSent = None
        if self.input_format in ['json', 'json-gzip']:
            self.iterSent = JSONDocReader(format=self.input_format)
        self.goodWordFields = ['lex', 'wf', 'wf_display',
                               'parts', 'gloss', 'gloss_index', 'n_ana',
                               'trans_en', 'trans_ru']
        self.AdditionalWordFields = set()
        if 'word_fields' in self.settings:
            self.AdditionalWordFields |= set(self.settings['word_fields'])
        if 'word_table_fields' in self.settings:
            self.AdditionalWordFields |= set(self.settings['word_table_fields'])
        if 'accidental_word_fields' in self.settings:
            self.AdditionalWordFields -= set(self.settings['accidental_word_fields'])
        f = open(os.path.join(self.SETTINGS_DIR, 'categories.json'),
                 'r', encoding='utf-8')
        categories = json.loads(f.read())
        self.goodWordFields += ['gr.' + v for lang in categories
                                for v in categories[lang].values()]
        self.goodWordFields = set(self.goodWordFields)
        f.close()
        self.pd = PrepareData()
        self.es = Elasticsearch()
        self.es_ic = IndicesClient(self.es)
        self.shuffled_ids = [i for i in range(1, 1000000)]
        random.shuffle(self.shuffled_ids)
        self.shuffled_ids.insert(0, 0)    # id=0 is special and should not change
        self.tmpWordIDs = [{} for i in range(len(self.languages))]    # word as JSON -> its integer ID
        self.tmpLemmaIDs = [{} for i in range(len(self.languages))]   # lemma as string -> its integer ID
        self.word2lemma = [{} for i in range(len(self.languages))]    # word's ID -> ID of its lemma (or -1, if none)
        self.wordFreqs = [{} for i in range(len(self.languages))]     # word's ID -> its frequency
        self.wordSFreqs = [{} for i in range(len(self.languages))]    # word's ID -> its number of sentences
        self.wordDocFreqs = [{} for i in range(len(self.languages))]  # (word's ID, dID) -> word frequency in the document
        # self.wordSIDs = [{} for i in range(len(self.languages))]      # word's ID -> set of sentence IDs
        self.wordDIDs = [{} for i in range(len(self.languages))]      # word's ID -> set of document IDs
        self.wfs = set()         # set of word forms (for sorting)
        self.lemmata = set()     # set of lemmata (for sorting)
        self.sID = 0          # current sentence ID for each language
        self.dID = 0          # current document ID
        self.wID = 0          # current word ID
        self.wordFreqID = 0
        self.numWords = 0     # number of words in current document
        self.numSents = 0     # number of sentences in current document
        self.numWordsLang = [0] * len(self.languages)    # number of words in each language in current document
        self.numSentsLang = [0] * len(self.languages)    # number of sentences in each language in current document
        self.totalNumWords = 0

    def delete_indices(self):
        if self.es_ic.exists(index=self.name + '.docs'):
            self.es_ic.delete(index=self.name + '.docs')
        if self.es_ic.exists(index=self.name + '.words'):
            self.es_ic.delete(index=self.name + '.words')
        if self.es_ic.exists(index=self.name + '.word_freqs'):
            self.es_ic.delete(index=self.name + '.word_freqs')
        if self.es_ic.exists(index=self.name + '.sentences'):
            self.es_ic.delete(index=self.name + '.sentences')

    def create_indices(self):
        self.wordMapping = self.pd.generate_words_mapping()
        self.wordFreqMapping = self.pd.generate_wordfreq_mapping()
        self.sentMapping = self.pd.generate_sentences_mapping(self.wordMapping)
        self.docMapping = self.pd.generate_docs_mapping()
        self.es_ic.create(index=self.name + '.docs',
                          body=self.docMapping)
        self.es_ic.create(index=self.name + '.words',
                          body=self.wordMapping)
        self.es_ic.create(index=self.name + '.sentences',
                          body=self.sentMapping)

    def randomize_id(self, realID):
        """
        Return a (relatively) randomized sentence ID. This randomization
        is needed in context-aware word queries where the sentences
        are iterated in the order determined by their IDs.
        """
        if realID < 0:
            return realID
        idStart, idEnd = realID // 1000000, realID % 1000000
        return idStart * 1000000 + self.shuffled_ids[idEnd]
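
    # Worked example (assumption: shuffled_ids[123] happens to hold 456789):
    # randomize_id(2000123) -> idStart = 2, idEnd = 123 ->
    # 2 * 1000000 + 456789 = 2456789, i.e. a sentence ID stays inside its
    # million-sized block but loses its original ordering within the block.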

    def enhance_word(self, word):
        """
        Add some calculated fields to the JSON word.
        """
        if 'ana' not in word:
            word['n_ana'] = 0
        else:
            word['n_ana'] = len(word['ana'])
            if word['n_ana'] >= 127:
                word['n_ana'] = 127

    def process_sentence_words(self, words, langID):
        """
        Take the list of words from a sentence, remove all non-searchable
        fields from them and add the cleaned words to the temporary
        word dictionaries.
        Add w_id and l_id properties to each word of the list.
        Return the value for the 'sent_analyses' meta field.
        """
        sIDAdded = set()   # word IDs for which the current sentence ID has already been counted
        bUniquelyAnalyzed = True
        bFullyAnalyzed = True
        for w in words:
            if w['wtype'] != 'word':
                continue
            self.numWords += 1
            self.numWordsLang[langID] += 1
            self.totalNumWords += 1
            self.enhance_word(w)
            wClean = {'lang': langID}
            lemma = ''
            for field in w:
                if field in self.goodWordFields or field in self.AdditionalWordFields:
                    wClean[field] = w[field]
                    if field == 'wf':
                        if 'wf_lowercase' not in self.settings or self.settings['wf_lowercase']:
                            wClean[field] = wClean[field].lower()
                        self.wfs.add(wClean[field])
            if 'ana' in w:
                if len(w['ana']) > 1:
                    bUniquelyAnalyzed = False
                lemma = self.get_lemma(w)
                self.lemmata.add(lemma)
                wClean['ana'] = []
                for ana in w['ana']:
                    cleanAna = {}
                    for anaField in ana:
                        if anaField in self.goodWordFields or anaField in self.AdditionalWordFields:
                            cleanAna[anaField] = ana[anaField]
                    wClean['ana'].append(cleanAna)
            if 'ana' not in w or len(w['ana']) <= 0:
                bFullyAnalyzed = False
                bUniquelyAnalyzed = False
            wCleanTxt = json.dumps(wClean, ensure_ascii=False, sort_keys=True)
            if wCleanTxt in self.tmpWordIDs[langID]:
                wID = self.tmpWordIDs[langID][wCleanTxt]
            else:
                wID = sum(len(self.tmpWordIDs[i]) for i in range(len(self.languages)))
                self.tmpWordIDs[langID][wCleanTxt] = wID
            w['w_id'] = wID
            if len(lemma) > 0:
                try:
                    lemmaID = self.tmpLemmaIDs[langID][lemma]
                except KeyError:
                    lemmaID = sum(len(self.tmpLemmaIDs[i])
                                  for i in range(len(self.languages))) + 1
                    self.tmpLemmaIDs[langID][lemma] = lemmaID
                self.word2lemma[langID][wID] = lemmaID
                w['l_id'] = lemmaID
            try:
                self.wordFreqs[langID][wID] += 1
            except KeyError:
                self.wordFreqs[langID][wID] = 1
            if wID not in sIDAdded:
                sIDAdded.add(wID)
                try:
                    self.wordSFreqs[langID][wID] += 1
                except KeyError:
                    self.wordSFreqs[langID][wID] = 1
            try:
                self.wordDIDs[langID][wID].add(self.dID)
            except KeyError:
                self.wordDIDs[langID][wID] = {self.dID}
            try:
                self.wordDocFreqs[langID][(wID, self.dID)] += 1
            except KeyError:
                self.wordDocFreqs[langID][(wID, self.dID)] = 1
        if not bFullyAnalyzed:
            return 'incomplete'
        if not bUniquelyAnalyzed:
            return 'complete'
        return 'unique'
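
    # The returned label ends up in the sentence metadata as 'sent_analyses':
    # 'incomplete' if at least one word has no analysis, 'complete' if every
    # word is analyzed but at least one is ambiguous, and 'unique' if every
    # word has exactly one analysis.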

    def make_sorting_function(self, lang):
        """
        Return a function that can be used for sorting tokens
        in a list according to the alphabetical ordering specified
        for the language lang.
        """
        sortingFunction = lambda x: x
        if lang in self.settings['lang_props'] and 'lexicographic_order' in self.settings['lang_props'][lang]:
            dictSort = {self.settings['lang_props'][lang]['lexicographic_order'][i]:
                            (i, self.settings['lang_props'][lang]['lexicographic_order'][i])
                        for i in range(len(self.settings['lang_props'][lang]['lexicographic_order']))}
            maxIndex = len(dictSort)

            def charReplaceFunction(c):
                if c in dictSort:
                    return dictSort[c]
                return (maxIndex, c)

            sortingFunction = lambda x: [charReplaceFunction(c) for c in x.lower()]
        return sortingFunction
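
    # E.g. with lexicographic_order = ['ä', 'ö'], the key for 'öl' becomes
    # [(1, 'ö'), (2, 'l')]: characters listed in the custom alphabet sort by
    # their position in it, while all other characters sort after them in
    # ordinary Unicode order.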

    def sort_words(self, lang):
        """
        Sort word forms and lemmata stored at earlier stages.
        Return dictionaries with positions of word forms and
        lemmata in the sorted list.
        If there is a custom alphabetical order for the language,
        use it. Otherwise, use standard lexicographic sorting.
        """
        wfsSorted = {}
        iOrder = 0
        sortingFunction = self.make_sorting_function(lang)
        for wf in sorted(self.wfs, key=sortingFunction):
            wfsSorted[wf] = iOrder
            iOrder += 1
        lemmataSorted = {}
        iOrder = 0
        for l in sorted(self.lemmata, key=sortingFunction):
            lemmataSorted[l] = iOrder
            iOrder += 1
        return wfsSorted, lemmataSorted

    def get_freq_ranks(self, freqsSorted):
        """
        Calculate frequency ranks and rank/quantile labels for words
        or lemmata.
        """
        freqToRank = {}
        quantiles = {}
        prevFreq = 0
        prevRank = 0
        for i in range(len(freqsSorted)):
            v = freqsSorted[i]
            if v != prevFreq:
                if prevFreq != 0:
                    freqToRank[prevFreq] = prevRank + (i - prevRank) // 2
                prevRank = i
                prevFreq = v
        if prevFreq != 0:
            freqToRank[prevFreq] = prevRank + (len(freqsSorted) - prevRank) // 2
        for q in [0.03, 0.04, 0.05, 0.1, 0.15, 0.2, 0.25, 0.5]:
            qIndex = math.ceil(q * len(freqsSorted))
            if qIndex >= len(freqsSorted):
                qIndex = len(freqsSorted) - 1
            if qIndex >= 0:
                quantiles[q] = freqsSorted[qIndex]
            else:
                quantiles[q] = 0
        return freqToRank, quantiles
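
    # Worked example: for freqsSorted = [10, 10, 7, 2, 2, 2], freqToRank
    # comes out as {10: 1, 7: 2, 2: 4} (each frequency is mapped to the
    # middle of the run of items that share it), and quantiles[q] is the
    # frequency found at position ceil(q * 6) of the sorted list.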

    def quantile_label(self, freq, rank, quantiles):
        """
        Return a string label of the frequency rank (for frequent items)
        or quantile. This label is shown to the user in word query results.
        """
        if freq > 1 and freq >= quantiles[0.5]:
            if freq > quantiles[0.03]:
                return '#' + str(rank + 1)
            else:
                return '&gt; ' + str(min(math.ceil(q * 100) for q in quantiles
                                     if freq >= quantiles[q])) + '%'
        return ''
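
    # E.g. with the quantiles from the example above, a word with freq=7 and
    # rank 2 does not exceed the 3% threshold (quantiles[0.03] = 10), so it
    # gets the label '&gt; 20%' (the smallest quantile whose frequency it
    # still reaches); only items above that threshold get '#<rank + 1>'.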

    def get_lemma(self, word, lower_lemma=True):
        """
        Join all lemmata from the analyses in the JSON representation
        of a word and return them as a single string.
        """
        if 'ana' not in word:
            return ''
        if 'keep_lemma_order' not in self.settings or not self.settings['keep_lemma_order']:
            curLemmata = set()
            for ana in word['ana']:
                if 'lex' in ana:
                    if type(ana['lex']) == list:
                        for l in ana['lex']:
                            lAdd = l
                            if lower_lemma:
                                lAdd = lAdd.lower()
                            curLemmata.add(lAdd)
                    else:
                        lAdd = ana['lex']
                        if lower_lemma:
                            lAdd = lAdd.lower()
                        curLemmata.add(lAdd)
            return '/'.join(l for l in sorted(curLemmata))
        curLemmata = []
        for ana in word['ana']:
            if 'lex' in ana:
                if type(ana['lex']) == list:
                    for l in ana['lex']:
                        lAdd = l
                        if lower_lemma:
                            lAdd = lAdd.lower()
                        curLemmata.append(lAdd)
                else:
                    lAdd = ana['lex']
                    if lower_lemma:
                        lAdd = lAdd.lower()
                    curLemmata.append(lAdd)
        return '/'.join(curLemmata)
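
    # E.g. a token with two analyses whose 'lex' values are 'Walk' and 'go'
    # yields 'go/walk' here (lowercased, deduplicated and sorted), or
    # 'walk/go' in the original order of analyses if keep_lemma_order is set.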

    def get_grdic(self, word, lang):
        """
        Join all dictionary grammar tag strings from the analyses in the
        JSON representation of a word and return them as a string, together
        with a string of joined translations.
        """
        if 'ana' not in word:
            return ''
        curGramm = set()
        translations = set()
        for ana in word['ana']:
            grTags = ''
            if 'gr.pos' in ana:
                value = ana['gr.pos']
                if type(value) == list:
                    value = ', '.join(value)
                grTags = value
            for field in sorted(ana):
                value = ana[field]
                if type(value) == list:
                    value = ', '.join(value)
                if ('lang_props' in self.settings
                        and lang in self.settings['lang_props']
                        and 'dictionary_categories' in self.settings['lang_props'][lang]
                        and field.startswith('gr.')
                        and field[3:] in self.settings['lang_props'][lang]['dictionary_categories']):
                    if len(grTags) > 0:
                        grTags += ', '
                    grTags += value
                elif field.startswith('trans_'):
                    translations.add(value)
            if len(grTags) > 0:
                curGramm.add(grTags)
        return ' | '.join(grdic for grdic in sorted(curGramm)), ' | '.join(tr for tr in sorted(translations))

    def iterate_lemmata(self, langID, lemmaFreqs, lemmaDIDs):
        """
        Iterate over all lemmata for one language collected at the
        word iteration stage.
        """
        lFreqsSorted = [v for v in sorted(lemmaFreqs.values(), reverse=True)]
        freqToRank, quantiles = self.get_freq_ranks(lFreqsSorted)
        iLemma = 0
        for l, lID in self.tmpLemmaIDs[langID].items():
            if iLemma % 250 == 0:
                print('indexing lemma', iLemma)
            lemmaJson = {'wf': l,
                         'freq': lemmaFreqs[lID],
                         'rank_true': freqToRank[lemmaFreqs[lID]],
                         'rank': self.quantile_label(lemmaFreqs[lID],
                                                     freqToRank[lemmaFreqs[lID]],
                                                     quantiles),
                         'n_docs': len(lemmaDIDs[lID])}
            curAction = {'_index': self.name + '.words',
                         '_type': 'lemma',
                         '_id': lID,
                         '_source': lemmaJson}
            iLemma += 1
            yield curAction

    def iterate_words(self):
        """
        Iterate through all words collected at the previous
        stage. Return JSON objects with actions for bulk indexing
        in Elasticsearch.
        """
        self.wID = 0

        for langID in range(len(self.languages)):
            wfsSorted, lemmataSorted = self.sort_words(self.languages[langID])
            iWord = 0
            print('Processing words in ' + self.languages[langID] + '...')
            lemmaFreqs = {}       # lemma ID -> its frequency
            lemmaDIDs = {}        # lemma ID -> its document IDs
            wFreqsSorted = [v for v in sorted(self.wordFreqs[langID].values(), reverse=True)]
            freqToRank, quantiles = self.get_freq_ranks(wFreqsSorted)
            # for wID in self.wordFreqs[langID]:
            for w, wID in self.tmpWordIDs[langID].items():
                if iWord % 500 == 0:
                    print('indexing word', iWord)
                try:
                    lID = self.word2lemma[langID][wID]
                except KeyError:
                    lID = 0
                wJson = json.loads(w)
                wfOrder = len(wfsSorted) + 1
                if 'wf' in wJson:
                    wfOrder = wfsSorted[wJson['wf']]
                lOrder = len(lemmataSorted) + 1
                if 'ana' in wJson:
                    lOrder = lemmataSorted[self.get_lemma(wJson)]
                wJson['wf_order'] = wfOrder
                wJson['l_order'] = lOrder
                wordFreq = self.wordFreqs[langID][wID]
                wJson['freq'] = wordFreq
                try:
                    lemmaFreqs[lID] += wordFreq
                except KeyError:
                    lemmaFreqs[lID] = wordFreq
                if lID != 0:
                    try:
                        lemmaDIDs[lID] |= self.wordDIDs[langID][wID]
                    except KeyError:
                        lemmaDIDs[lID] = set(self.wordDIDs[langID][wID])
                # wJson['sids'] = [sid for sid in sorted(self.wordSIDs[langID][wID])]
                wJson['dids'] = [did for did in sorted(self.wordDIDs[langID][wID])]
                wJson['n_sents'] = self.wordSFreqs[langID][wID]
                wJson['n_docs'] = len(wJson['dids'])
                wJson['rank_true'] = freqToRank[wJson['freq']]  # for the calculations
                wJson['rank'] = self.quantile_label(wJson['freq'],
                                                    wJson['rank_true'],
                                                    quantiles)  # for the user
                curAction = {'_index': self.name + '.words',
                             '_type': 'word',
                             '_id': wID,
                             '_source': wJson,
                             '_parent': lID}
                yield curAction

                for docID in wJson['dids']:
                    wfreqJson = {'w_id': wID,
                                 'd_id': docID,
                                 'wf_order': wfOrder,
                                 'l_order': lOrder,
                                 'freq': self.wordDocFreqs[langID][(wID, docID)]}
                    curAction = {'_index': self.name + '.words',
                                 '_type': 'word_freq',
                                 '_id': self.wordFreqID,
                                 '_source': wfreqJson,
                                 '_parent': wID,
                                 '_routing': lID}
                    self.wordFreqID += 1
                    yield curAction
                iWord += 1
                self.wID += 1
            for lAction in self.iterate_lemmata(langID, lemmaFreqs, lemmaDIDs):
                yield lAction
        emptyLemmaJson = {'wf': 0,
                          'freq': 0,
                          'rank_true': -1}
        curAction = {'_index': self.name + '.words',
                     '_type': 'lemma',
                     '_id': 0,
                     '_source': emptyLemmaJson}
        yield curAction
        self.wfs = None
        self.lemmata = None


    def generate_dictionary(self, fnameOut):
        """
        For each language, write out an HTML dictionary containing all lexemes of the corpus.
        """
        for langID in range(len(self.languages)):
            iWord = 0
            print('Generating dictionary for ' + self.languages[langID] + '...')
            lexFreqs = {}       # lemma ID -> its frequency
            wFreqsSorted = [v for v in sorted(self.wordFreqs[langID].values(), reverse=True)]
            freqToRank, quantiles = self.get_freq_ranks(wFreqsSorted)
            # for wID in self.wordFreqs[langID]:
            for w, wID in self.tmpWordIDs[langID].items():
                if iWord % 1000 == 0:
                    print('processing word', iWord, 'for the dictionary')
                iWord += 1
                wJson = json.loads(w)
                if 'ana' not in wJson or len(wJson['ana']) <= 0:
                    continue
                lemma = self.get_lemma(wJson, lower_lemma=False)
                grdic, translations = self.get_grdic(wJson, self.languages[langID])
                wordFreq = self.wordFreqs[langID][wID]
                lexTuple = (lemma, grdic, translations)
                if lexTuple not in lexFreqs:
                    lexFreqs[lexTuple] = wordFreq
                else:
                    lexFreqs[lexTuple] += wordFreq
            if len(lexFreqs) <= 0:
                continue

            fOut = open(os.path.join('../search/web_app/templates', 'dictionary_' + self.settings['corpus_name']
                                     + '_' + self.languages[langID] + '.html'), 'w', encoding='utf-8')
            fOut.write('<h1 class="dictionary_header"> {{ _(\'Dictionary_header\') }} '
                       '({{ _(\'langname_' + self.languages[langID] + '\') }})</h1>\n')
            prevLetter = ''
            sortingFunction = self.make_sorting_function(self.settings['languages'][langID])
            for lemma, grdic, trans in sorted(lexFreqs, key=lambda x: (sortingFunction(x[0].lower()), -lexFreqs[x])):
                if len(lemma) <= 0:
                    continue
                curLetter = lemma.lower()[0]
                if curLetter != prevLetter:
                    if prevLetter != '':
                        fOut.write('</tbody>\n</table>\n')
                    fOut.write('<h2 class="dictionary_letter">' + curLetter.upper() + '</h2>\n')
                    fOut.write('<table class="dictionary_table">\n<thead>\n'
                               '<th>{{ _(\'word_th_lemma\') }}</th>'
                               '<th>{{ _(\'word_th_gr\') }}</th>'
                               '<th>{{ _(\'word_th_trans_en\') }}</th>'
                               '<th>{{ _(\'word_th_frequency\') }}</th>'
                               '</thead>\n<tbody>\n')
                    prevLetter = curLetter
                fOut.write('<tr>\n<td class="dictionary_lemma">' + lemma + '</td><td>' + grdic + '</td>'
                           '<td>' + trans + '</td><td>'
                           + str(lexFreqs[(lemma, grdic, trans)]) + '</td></tr>\n')
            if prevLetter != '':
                fOut.write('</tbody>\n</table>\n')
            fOut.close()

    def index_words(self):
        """
        Index all words that have been collected at the previous stage
        (while the sentences were being indexed).
        """
        bulk(self.es, self.iterate_words(), chunk_size=300, request_timeout=60)
        if 'generate_dictionary' in self.settings and self.settings['generate_dictionary']:
            self.generate_dictionary(os.path.join(self.corpus_dir, 'dictionary.html'))

    def add_parallel_sids(self, sentences, paraIDs):
        """
        In the parallel corpus, add the IDs of aligned sentences in other languages
        to each sentence that has a para_id.
        """
        for s in sentences:
            if 'para_alignment' not in s['_source'] or 'lang' not in s['_source']:
                continue
            langID = s['_source']['lang']
            for pa in s['_source']['para_alignment']:
                paraID = pa['para_id']
                pa['sent_ids'] = []
                for i in range(len(self.languages)):
                    if i == langID:
                        continue
                    if paraID in paraIDs[i]:
                        pa['sent_ids'] += paraIDs[i][paraID]

    def iterate_sentences(self, fname):
        self.numSents = 0
        prevLast = False
        sentences = []
        paraIDs = [{} for i in range(len(self.languages))]
        for s, bLast in self.iterSent.get_sentences(fname):
            if 'lang' in s:
                langID = s['lang']
            else:
                langID = 0
                s['lang'] = langID
            s['n_words'] = 0
            if 'words' in s:
                sentAnaMeta = self.process_sentence_words(s['words'], langID)
                s['n_words'] = sum(1 for w in s['words'] if 'wtype' in w and w['wtype'] == 'word')
                if 'meta' not in s:
                    s['meta'] = {}
                s['meta']['sent_analyses'] = sentAnaMeta
            if prevLast:
                prevLast = False
            elif self.numSents > 0:
                s['prev_id'] = self.randomize_id(self.sID - 1)
            if not bLast and 'last' not in s:
                s['next_id'] = self.randomize_id(self.sID + 1)
            else:
                prevLast = True
            s['doc_id'] = self.dID
            if 'meta' in s:
                for metaField in [mf for mf in s['meta'].keys() if not (mf.startswith('year') or mf.endswith('_kw'))]:
                    s['meta'][metaField + '_kw'] = s['meta'][metaField]
            # self.es.index(index=self.name + '.sentences',
            #               doc_type='sentence',
            #               id=self.sID,
            #               body=s)
            curAction = {'_index': self.name + '.sentences',
                         '_type': 'sentence',
                         '_id': self.randomize_id(self.sID),
                         '_source': s}
            if len(self.languages) <= 1:
                yield curAction
            else:
                sentences.append(curAction)
                if 'para_alignment' in s:
                    s['para_ids'] = []
                    for pa in s['para_alignment']:
                        paraID = str(self.dID) + '_' + str(pa['para_id'])
                        pa['para_id'] = paraID
                        s['para_ids'].append(paraID)
                        try:
                            paraIDs[langID][paraID].append(self.randomize_id(self.sID))
                        except KeyError:
                            paraIDs[langID][paraID] = [self.randomize_id(self.sID)]
            if self.sID % 500 == 0:
                print('Indexing sentence', self.sID, ',', self.totalNumWords, 'words so far.')
            self.numSents += 1
            self.numSentsLang[langID] += 1
            self.sID += 1
        if len(self.languages) > 1:
            self.add_parallel_sids(sentences, paraIDs)
            for s in sentences:
                yield s

    @staticmethod
    def add_meta_keywords(meta):
        """
        For each text field in the metadata, add a keyword version
        of the same field.
        """
        for field in [k for k in meta.keys() if not k.startswith('year')]:
            meta[field + '_kw'] = meta[field]

    def index_doc(self, fname):
        """
        Store the metadata of the source file.
        """
        if self.dID % 100 == 0:
            print('Indexing document', self.dID)
        meta = self.iterSent.get_metadata(fname)
        self.add_meta_keywords(meta)
        meta['n_words'] = self.numWords
        meta['n_sents'] = self.numSents
        if len(self.settings['languages']) > 1:
            for i in range(len(self.languages)):
                meta['n_words_' + self.languages[i]] = self.numWordsLang[i]
                meta['n_sents_' + self.languages[i]] = self.numSentsLang[i]
        self.numWords = 0
        self.numSents = 0
        self.numWordsLang = [0] * len(self.languages)
        self.numSentsLang = [0] * len(self.languages)
        try:
            self.es.index(index=self.name + '.docs',
                          doc_type='doc',
                          id=self.dID,
                          body=meta)
        except RequestError as err:
            print('Metadata error: {0}'.format(err))
            shortMeta = {}
            if 'filename' in meta:
                shortMeta['filename'] = meta['filename']
            if 'title' in meta:
                shortMeta['title'] = meta['title']
                shortMeta['title_kw'] = meta['title']
                self.es.index(index=self.name + '.docs',
                              doc_type='doc',
                              id=self.dID,
                              body=shortMeta)
        self.dID += 1

    def index_dir(self):
        """
        Index all files from the corpus directory, sorted by their size
        in decreasing order. Such sorting helps prevent memory errors
        when indexing large corpora, as the default behavior is to load
        the whole file into memory, and there is more free memory
        at the beginning of the process. If a MemoryError occurs, the
        iterative JSON parser is used, which works much more slowly.
        """
        filenames = []
        for root, dirs, files in os.walk(self.corpus_dir):
            for fname in files:
                if (not ((self.settings['input_format'] == 'json'
                          and fname.lower().endswith('.json'))
                         or (self.settings['input_format'] == 'json-gzip'
                             and fname.lower().endswith('.json.gz')))):
                    continue
                fnameFull = os.path.join(root, fname)
                filenames.append((fnameFull, os.path.getsize(fnameFull)))
        if len(filenames) <= 0:
            print('There are no files in this corpus.')
            return
        for fname, fsize in sorted(filenames, key=lambda p: -p[1]):
            # print(fname, fsize)
            bulk(self.es, self.iterate_sentences(fname), chunk_size=200, request_timeout=60)
            self.index_doc(fname)
        self.index_words()

    def compile_translations(self):
        """
        Compile flask_babel translations in ../search/web_app.
        """
        pythonPath = ''
        for p in sys.path:
            if re.search('Python3[^/\\\\]*[/\\\\]?$', p) is not None:
                pythonPath = p
                break
        if len(pythonPath) <= 0:
            pyBabelPath = 'pybabel'
        else:
            pyBabelPath = os.path.join(pythonPath, 'Scripts', 'pybabel')
        try:
            subprocess.run([pyBabelPath, 'compile', '-d', 'translations'], cwd='../search/web_app', check=True)
        except Exception:
            print('Could not compile translations with ' + pyBabelPath + ' .')
        else:
            print('Interface translations compiled.')

    def load_corpus(self):
        """
        Drop the current database, if any, and load the entire corpus.
        """
        t1 = time.time()
        self.compile_translations()
        self.delete_indices()
        self.create_indices()
        self.index_dir()
        t2 = time.time()
        print('Corpus indexed in', t2-t1, 'seconds:',
              self.dID, 'documents,',
              self.sID, 'sentences,',
              self.totalNumWords, 'words,',
              sum(len(self.wordFreqs[i]) for i in range(len(self.languages))), 'word types (different words).')
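
A minimal sketch of the module header and entry point that the listing above appears to assume is given below. The standard-library and elasticsearch imports are taken directly from the names used in the code; JSONDocReader and PrepareData are local project modules whose import paths are assumptions, and the __main__ guard is illustrative rather than part of the original listing.

import json
import math
import os
import random
import re
import subprocess
import sys
import time

from elasticsearch import Elasticsearch, RequestError
from elasticsearch.client import IndicesClient
from elasticsearch.helpers import bulk

# Assumed local modules; their actual import paths are not shown in the listing.
# from json_doc_reader import JSONDocReader
# from prepare_data import PrepareData

if __name__ == '__main__':
    x = Indexator()
    x.load_corpus()
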
Example No. 2
class Indexator:
    """
    Contains methods for loading the JSON documents into the corpus
    database.
    """
    SETTINGS_DIR = '../conf'

    def __init__(self):
        f = open(os.path.join(self.SETTINGS_DIR, 'corpus.json'),
                 'r',
                 encoding='utf-8')
        self.settings = json.loads(f.read())
        f.close()
        self.name = self.settings['corpus_name']
        self.languages = self.settings['languages']
        if len(self.languages) <= 0:
            self.languages = [self.name]
        self.input_format = self.settings['input_format']
        self.corpus_dir = os.path.join('../', self.name)
        print(os.path.abspath(self.corpus_dir))
        self.iterSent = None
        if self.input_format in ['json', 'json-gzip']:
            self.iterSent = JSONDocReader(format=self.input_format)
        self.goodWordFields = ['lex', 'wf', 'ana', 'trans_en', 'trans_ru']
        self.AdditionalWordFields = []
        if 'word_fields' in self.settings:
            self.AdditionalWordFields = self.settings['word_fields']
        f = open(os.path.join(self.SETTINGS_DIR, 'categories.json'),
                 'r',
                 encoding='utf-8')
        categories = json.loads(f.read())
        self.goodWordFields += [
            'gr.' + v for lang in categories
            for v in categories[lang].values()
        ]
        self.goodWordFields = set(self.goodWordFields)
        f.close()
        self.pd = PrepareData()
        self.es = Elasticsearch()
        self.es_ic = IndicesClient(self.es)
        self.shuffled_ids = [i for i in range(1, 1000000)]
        random.shuffle(self.shuffled_ids)
        # id=0 is special and should not change
        self.shuffled_ids.insert(0, 0)
        # word as JSON -> its integer ID
        self.tmpWordIDs = [{} for i in range(len(self.languages))]
        # lemma as string -> its integer ID
        self.tmpLemmaIDs = [{} for i in range(len(self.languages))]
        # word's ID -> ID of its lemma (or -1, if none)
        self.word2lemma = [{} for i in range(len(self.languages))]
        # word's ID -> its frequency
        self.wordFreqs = [{} for i in range(len(self.languages))]
        # word's ID -> its number of sentences
        self.wordSFreqs = [{} for i in range(len(self.languages))]
        # (word's ID, dID) -> word frequency in the document
        self.wordDocFreqs = [{} for i in range(len(self.languages))]
        # word's ID -> set of sentence IDs
        # self.wordSIDs = [{} for i in range(len(self.languages))]
        # word's ID -> set of document IDs
        self.wordDIDs = [{} for i in range(len(self.languages))]
        self.wfs = set()  # set of word forms (for sorting)
        self.lemmata = set()  # set of lemmata (for sorting)
        self.sID = 0  # current sentence ID for each language
        self.dID = 0  # current document ID
        self.wID = 0  # current word ID
        self.wordFreqID = 0
        self.numWords = 0  # number of words in current document
        self.numSents = 0  # number of sentences in current document
        # number of words in each language in current document
        self.numWordsLang = [0] * len(self.languages)
        # number of sentences in each language in current document
        self.numSentsLang = [0] * len(self.languages)
        self.totalNumWords = 0

    def delete_indices(self):
        if self.es_ic.exists(index=self.name + '.docs'):
            self.es_ic.delete(index=self.name + '.docs')
        if self.es_ic.exists(index=self.name + '.words'):
            self.es_ic.delete(index=self.name + '.words')
        if self.es_ic.exists(index=self.name + '.word_freqs'):
            self.es_ic.delete(index=self.name + '.word_freqs')
        if self.es_ic.exists(index=self.name + '.sentences'):
            self.es_ic.delete(index=self.name + '.sentences')

    def create_indices(self):
        self.wordMapping = self.pd.generate_words_mapping()
        self.wordFreqMapping = self.pd.generate_wordfreq_mapping()
        self.sentMapping = self.pd.generate_sentences_mapping(self.wordMapping)
        self.docMapping = self.pd.generate_docs_mapping()
        self.es_ic.create(index=self.name + '.docs', body=self.docMapping)
        self.es_ic.create(index=self.name + '.words', body=self.wordMapping)
        self.es_ic.create(index=self.name + '.sentences',
                          body=self.sentMapping)

    def randomize_id(self, realID):
        """
        Return a (relatively) randomized sentence ID. This randomization
        is needed in context-aware word queries where the sentences
        are iterated in the order determined by their IDs.
        """
        if realID < 0:
            return realID
        idStart, idEnd = realID // 1000000, realID % 1000000
        return idStart * 1000000 + self.shuffled_ids[idEnd]

    def enhance_word(self, word):
        """
        Add some calculated fields to the JSON word.
        """
        if 'ana' not in word:
            word['n_ana'] = 0
        else:
            word['n_ana'] = len(word['ana'])
            if word['n_ana'] >= 127:
                word['n_ana'] = 127

    def process_sentence_words(self, words, langID):
        """
        Take the list of words from a sentence, remove all non-searchable
        fields from them and add the cleaned words to the temporary
        word dictionaries.
        Add a w_id property to each word of the list.
        """
        # word IDs for which the current sentence ID has already been counted
        sIDAdded = set()
        for w in words:
            if w['wtype'] != 'word':
                continue
            self.numWords += 1
            self.numWordsLang[langID] += 1
            self.totalNumWords += 1
            self.enhance_word(w)
            wClean = {'lang': langID}
            lemma = ''
            for field in w:
                if field in self.goodWordFields:
                    wClean[field] = w[field]
                    if field == 'wf':
                        wClean[field] = wClean[field].lower()
                        self.wfs.add(wClean[field])
            if 'ana' in w:
                lemma = self.get_lemma(w)
                self.lemmata.add(lemma)
                wClean['ana'] = []
                for ana in w['ana']:
                    cleanAna = {}
                    for anaField in ana:
                        if anaField in self.goodWordFields:
                            cleanAna[anaField] = ana[anaField]
                    wClean['ana'].append(cleanAna)
            wCleanTxt = json.dumps(wClean, ensure_ascii=False, sort_keys=True)
            if wCleanTxt in self.tmpWordIDs[langID]:
                wID = self.tmpWordIDs[langID][wCleanTxt]
            else:
                wID = sum(
                    len(self.tmpWordIDs[i])
                    for i in range(len(self.languages)))
                self.tmpWordIDs[langID][wCleanTxt] = wID
            w['w_id'] = wID
            if len(lemma) > 0:
                try:
                    lemmaID = self.tmpLemmaIDs[langID][lemma]
                except KeyError:
                    lemmaID = sum(
                        len(self.tmpLemmaIDs[i])
                        for i in range(len(self.languages))) + 1
                    self.tmpLemmaIDs[langID][lemma] = lemmaID
                self.word2lemma[langID][wID] = lemmaID

            try:
                self.wordFreqs[langID][wID] += 1
            except KeyError:
                self.wordFreqs[langID][wID] = 1
            if wID not in sIDAdded:
                sIDAdded.add(wID)
                try:
                    self.wordSFreqs[langID][wID] += 1
                except KeyError:
                    self.wordSFreqs[langID][wID] = 1
            try:
                self.wordDIDs[langID][wID].add(self.dID)
            except KeyError:
                self.wordDIDs[langID][wID] = {self.dID}
            try:
                self.wordDocFreqs[langID][(wID, self.dID)] += 1
            except KeyError:
                self.wordDocFreqs[langID][(wID, self.dID)] = 1

    def sort_words(self):
        """
        Sort word forms and lemmata stored at earlier stages.
        Return dictionaries with positions of word forms and
        lemmata in the sorted list. Delete the original lists.
        """
        wfsSorted = {}
        iOrder = 0
        for wf in sorted(self.wfs):
            wfsSorted[wf] = iOrder
            iOrder += 1
        self.wfs = None
        lemmataSorted = {}
        iOrder = 0
        for l in sorted(self.lemmata):
            lemmataSorted[l] = iOrder
            iOrder += 1
        self.lemmata = None
        return wfsSorted, lemmataSorted

    def get_freq_ranks(self, freqsSorted):
        """
        Calculate frequency ranks and rank/quantile labels for words
        or lemmata.
        """
        freqToRank = {}
        quantiles = {}
        prevFreq = 0
        prevRank = 0
        for i in range(len(freqsSorted)):
            v = freqsSorted[i]
            if v != prevFreq:
                if prevFreq != 0:
                    freqToRank[prevFreq] = prevRank + (i - prevRank) // 2
                prevRank = i
                prevFreq = v
        if prevFreq != 0:
            freqToRank[prevFreq] = (prevRank +
                                    (len(freqsSorted) - prevRank) // 2)
        for q in [0.03, 0.04, 0.05, 0.1, 0.15, 0.2, 0.25, 0.5]:
            qIndex = math.ceil(q * len(freqsSorted))
            if qIndex >= len(freqsSorted):
                qIndex = len(freqsSorted) - 1
            if qIndex >= 0:
                quantiles[q] = freqsSorted[qIndex]
            else:
                quantiles[q] = 0
        return freqToRank, quantiles

    def quantile_label(self, freq, rank, quantiles):
        """
        Return a string label of the frequency rank (for frequent items)
        or quantile. This label is shown to the user in word query results.
        """
        if freq > 1 and freq >= quantiles[0.5]:
            if freq > quantiles[0.03]:
                return '#' + str(rank + 1)
            else:
                return '&gt; ' + str(
                    min(
                        math.ceil(q * 100)
                        for q in quantiles if freq >= quantiles[q])) + '%'
        return ''

    def get_lemma(self, word):
        """
        Join all lemmata from the analyses in the JSON representation
        of a word and return them as a single string.
        """
        if 'ana' not in word:
            return ''
        curLemmata = set()
        for ana in word['ana']:
            if 'lex' in ana:
                curLemmata.add(ana['lex'].lower())
        return '/'.join(l for l in sorted(curLemmata))

    def iterate_lemmata(self, langID, lemmaFreqs, lemmaDIDs):
        """
        Iterate over all lemmata for one language collected at the
        word iteration stage.
        """
        lFreqsSorted = [v for v in sorted(lemmaFreqs.values(), reverse=True)]
        freqToRank, quantiles = self.get_freq_ranks(lFreqsSorted)
        iLemma = 0
        for l, lID in self.tmpLemmaIDs[langID].items():
            if iLemma % 250 == 0:
                print('indexing lemma', iLemma)
            lemmaJson = {
                'wf': l,
                'freq': lemmaFreqs[lID],
                'rank_true': freqToRank[lemmaFreqs[lID]],
                'rank': self.quantile_label(lemmaFreqs[lID],
                                            freqToRank[lemmaFreqs[lID]],
                                            quantiles),
                'n_docs': len(lemmaDIDs[lID])
            }
            curAction = {
                '_index': self.name + '.words',
                '_type': 'lemma',
                '_id': lID,
                '_source': lemmaJson
            }
            iLemma += 1
            yield curAction

    def iterate_words(self):
        """
        Iterate through all words collected at the previous
        stage. Return JSON objects with actions for bulk indexing
        in Elasticsearch.
        """
        self.wID = 0
        wfsSorted, lemmataSorted = self.sort_words()

        for langID in range(len(self.languages)):
            iWord = 0
            print('Processing words in ' + self.languages[langID] + '...')
            lemmaFreqs = {}  # lemma ID -> its frequency
            lemmaDIDs = {}  # lemma ID -> its document IDs
            wFreqsSorted = [
                v
                for v in sorted(self.wordFreqs[langID].values(), reverse=True)
            ]
            freqToRank, quantiles = self.get_freq_ranks(wFreqsSorted)
            # for wID in self.wordFreqs[langID]:
            for w, wID in self.tmpWordIDs[langID].items():
                if iWord % 500 == 0:
                    print('indexing word', iWord)
                try:
                    lID = self.word2lemma[langID][wID]
                except KeyError:
                    lID = 0
                wJson = json.loads(w)
                wfOrder = len(wfsSorted) + 1
                if 'wf' in wJson:
                    wfOrder = wfsSorted[wJson['wf']]
                lOrder = len(lemmataSorted) + 1
                if 'ana' in wJson:
                    lOrder = lemmataSorted[self.get_lemma(wJson)]
                wJson['wf_order'] = wfOrder
                wJson['l_order'] = lOrder
                wordFreq = self.wordFreqs[langID][wID]
                wJson['freq'] = wordFreq
                try:
                    lemmaFreqs[lID] += wordFreq
                except KeyError:
                    lemmaFreqs[lID] = wordFreq
                if lID != 0:
                    try:
                        lemmaDIDs[lID] |= self.wordDIDs[langID][wID]
                    except KeyError:
                        lemmaDIDs[lID] = set(self.wordDIDs[langID][wID])
                # wJson['sids'] = [sid for sid in sorted(
                # self.wordSIDs[langID][wID])]
                wJson['dids'] = [
                    did for did in sorted(self.wordDIDs[langID][wID])
                ]
                wJson['n_sents'] = self.wordSFreqs[langID][wID]
                wJson['n_docs'] = len(wJson['dids'])
                wJson['rank_true'] = freqToRank[wJson['freq']]
                # for the calculations
                wJson['rank'] = self.quantile_label(wJson['freq'],
                                                    wJson['rank_true'],
                                                    quantiles)  # for the user
                curAction = {
                    '_index': self.name + '.words',
                    '_type': 'word',
                    '_id': wID,
                    '_source': wJson,
                    '_parent': lID
                }
                yield curAction

                for docID in wJson['dids']:
                    wfreqJson = {
                        'w_id': wID,
                        'd_id': docID,
                        'wf_order': wfOrder,
                        'l_order': lOrder,
                        'freq': self.wordDocFreqs[langID][(wID, docID)]
                    }
                    curAction = {
                        '_index': self.name + '.words',
                        '_type': 'word_freq',
                        '_id': self.wordFreqID,
                        '_source': wfreqJson,
                        '_parent': wID,
                        '_routing': lID
                    }
                    self.wordFreqID += 1
                    yield curAction
                iWord += 1
                self.wID += 1
            for lAction in self.iterate_lemmata(langID, lemmaFreqs, lemmaDIDs):
                yield lAction
        emptyLemmaJson = {'wf': 0, 'freq': 0, 'rank_true': -1}
        curAction = {
            '_index': self.name + '.words',
            '_type': 'lemma',
            '_id': 0,
            '_source': emptyLemmaJson
        }
        yield curAction

    def index_words(self):
        """
        Index all words that have been collected at the previous stage
        (while the sentences were being indexed).
        """
        bulk(self.es, self.iterate_words(), chunk_size=300, request_timeout=60)

    def add_parallel_sids(self, sentences, paraIDs):
        """
        In the parallel corpus, add the IDs of aligned sentences in other
        languages to each sentence that has a para_id.
        """
        for s in sentences:
            if ('para_alignment' not in s['_source']
                    or 'lang' not in s['_source']):
                continue
            langID = s['_source']['lang']
            for pa in s['_source']['para_alignment']:
                paraID = pa['para_id']
                pa['sent_ids'] = []
                for i in range(len(self.languages)):
                    if i == langID:
                        continue
                    if paraID in paraIDs[i]:
                        pa['sent_ids'] += paraIDs[i][paraID]

    def iterate_sentences(self, fname):
        self.numSents = 0
        sentences = []
        paraIDs = [{} for i in range(len(self.languages))]
        for s, bLast in self.iterSent.get_sentences(fname):
            if 'lang' in s:
                langID = s['lang']
            else:
                langID = 0
                s['lang'] = langID
            if 'words' in s:
                self.process_sentence_words(s['words'], langID)
            if self.numSents > 0:
                s['prev_id'] = self.randomize_id(self.sID - 1)
            if not bLast and 'last' not in s:
                s['next_id'] = self.randomize_id(self.sID + 1)
            s['doc_id'] = self.dID
            # self.es.index(index=self.name + '.sentences',
            #               doc_type='sentence',
            #               id=self.sID,
            #               body=s)
            curAction = {
                '_index': self.name + '.sentences',
                '_type': 'sentence',
                '_id': self.randomize_id(self.sID),
                '_source': s
            }
            if len(self.languages) <= 1:
                yield curAction
            else:
                sentences.append(curAction)
                if 'para_alignment' in s:
                    s['para_ids'] = []
                    for pa in s['para_alignment']:
                        paraID = str(self.dID) + '_' + str(pa['para_id'])
                        pa['para_id'] = paraID
                        s['para_ids'].append(paraID)
                        try:
                            paraIDs[langID][paraID].append(
                                self.randomize_id(self.sID))
                        except KeyError:
                            paraIDs[langID][paraID] = [
                                self.randomize_id(self.sID)
                            ]
            if self.sID % 100000 == 0:
                print('Indexing sentence', self.sID, ',', self.totalNumWords,
                      'words so far.')
            self.numSents += 1
            self.numSentsLang[langID] += 1
            self.sID += 1
        if len(self.languages) > 1:
            self.add_parallel_sids(sentences, paraIDs)
            for s in sentences:
                yield s

    @staticmethod
    def add_meta_keywords(meta):
        """
        For each text field in the metadata, add a keyword version
        of the same field.
        """
        for field in [k for k in meta.keys() if not k.startswith('year')]:
            meta[field + '_kw'] = meta[field]

    def index_doc(self, fname):
        """
        Store the metadata of the source file.
        """
        if self.dID % 100 == 0:
            print('Indexing document', self.dID)
        meta = self.iterSent.get_metadata(fname)
        self.add_meta_keywords(meta)
        meta['n_words'] = self.numWords
        meta['n_sents'] = self.numSents
        if len(self.settings['languages']) > 1:
            for i in range(len(self.languages)):
                meta['n_words_' + self.languages[i]] = self.numWordsLang[i]
                meta['n_sents_' + self.languages[i]] = self.numSentsLang[i]
        self.numWords = 0
        self.numSents = 0
        self.numWordsLang = [0] * len(self.languages)
        self.numSentsLang = [0] * len(self.languages)
        try:
            self.es.index(index=self.name + '.docs',
                          doc_type='doc',
                          id=self.dID,
                          body=meta)
        except RequestError as err:
            print('Metadata error: {0}'.format(err))
            shortMeta = {}
            if 'filename' in meta:
                shortMeta['filename'] = meta['filename']
            if 'title' in meta:
                shortMeta['title'] = meta['title']
                shortMeta['title_kw'] = meta['title']
                self.es.index(index=self.name + '.docs',
                              doc_type='doc',
                              id=self.dID,
                              body=shortMeta)
        self.dID += 1

    def index_dir(self):
        """
        Index all files from the corpus directory, sorted by their size
        in decreasing order. Such sorting helps prevent memory errors
        when indexing large corpora, as the default behavior is to load
        the whole file into memory, and there is more free memory
        at the beginning of the process. If a MemoryError occurs, the
        iterative JSON parser is used, which works much more slowly.
        """
        filenames = []
        for root, _, files in os.walk(self.corpus_dir):
            for fname in files:
                if (not ((self.settings['input_format'] == 'json'
                          and fname.lower().endswith('.json')) or
                         (self.settings['input_format'] == 'json-gzip'
                          and fname.lower().endswith('.json.gz')))):
                    continue
                fnameFull = os.path.join(root, fname)
                filenames.append((fnameFull, os.path.getsize(fnameFull)))
        if len(filenames) <= 0:
            print('There are no files in this corpus.')
            return
        for fname, _ in sorted(filenames, key=lambda p: -p[1]):
            # print(fname, fsize)
            try:
                bulk(self.es,
                     self.iterate_sentences(fname),
                     chunk_size=200,
                     request_timeout=60)
                self.index_doc(fname)
            except Exception as err:
                print('Error while indexing', fname, ':', err)
        self.index_words()

    def compile_translations(self):
        """
        Compile flask_babel translations in ../search/web_app.
        """
        pythonPath = ''
        for p in sys.path:
            if re.search('Python3[^/\\\\]*[/\\\\]?$', p) is not None:
                pythonPath = p
                break
        if len(pythonPath) <= 0:
            pyBabelPath = 'pybabel'
        else:
            pyBabelPath = os.path.join(pythonPath, 'Scripts', 'pybabel')
        try:
            subprocess.run([pyBabelPath, 'compile', '-d', 'translations'],
                           cwd='../search/web_app',
                           check=True)
        except Exception:
            print('Could not compile translations with ' + pyBabelPath + ' .')
        else:
            print('Interface translations compiled.')

    def load_corpus(self):
        """
        Drop the current database, if any, and load the entire corpus.
        """
        t1 = time.time()
        self.compile_translations()
        self.delete_indices()
        self.create_indices()
        self.index_dir()
        t2 = time.time()
        print('Corpus indexed in', t2 - t1, 'seconds:', self.dID, 'documents,',
              self.sID, 'sentences,', self.totalNumWords, 'words,',
              sum(len(self.wordFreqs[i]) for i in range(len(self.languages))),
              'word types (different words).')
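
Both constructors read ../conf/corpus.json and ../conf/categories.json before doing anything else. Below is a hedged sketch of minimal versions of those two files, written as a small Python script; it only uses keys that the listings above actually look up, and all concrete values ('my_corpus', 'mylang', the tag-to-category pairs) are placeholders, not part of the original code.

import json
import os

CONF_DIR = '../conf'  # same location as Indexator.SETTINGS_DIR
os.makedirs(CONF_DIR, exist_ok=True)

# corpus.json: required keys read in __init__. Optional keys such as
# 'word_fields', 'lang_props', 'wf_lowercase', 'keep_lemma_order' or
# 'generate_dictionary' can be added for the methods that check for them.
corpus_settings = {
    'corpus_name': 'my_corpus',   # placeholder
    'languages': ['mylang'],      # at least one language
    'input_format': 'json'       # or 'json-gzip'
}

# categories.json: language -> {grammatical tag -> category name};
# __init__ turns every category name into a searchable 'gr.<category>' field.
categories = {
    'mylang': {'N': 'pos', 'V': 'pos', 'sg': 'number', 'pl': 'number'}
}

with open(os.path.join(CONF_DIR, 'corpus.json'), 'w', encoding='utf-8') as fOut:
    json.dump(corpus_settings, fOut, ensure_ascii=False, indent=2)
with open(os.path.join(CONF_DIR, 'categories.json'), 'w', encoding='utf-8') as fOut:
    json.dump(categories, fOut, ensure_ascii=False, indent=2)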