Exemple #1
0
 def __init__(self):
     print(os.path.join(self.SETTINGS_DIR, 'corpus.json'))
     f = open(os.path.join(self.SETTINGS_DIR, 'corpus.json'),
              'r', encoding='utf-8')
     self.settings = json.loads(f.read())
     f.close()
     self.name = self.settings['corpus_name']
     self.languages = self.settings['languages']
     if len(self.languages) <= 0:
         self.languages = [self.name]
     self.input_format = self.settings['input_format']
     self.corpus_dir = os.path.join('../corpus', self.name)
     self.iterSent = None
     if self.input_format in ['json', 'json-gzip']:
         self.iterSent = JSONDocReader(format=self.input_format)
     self.goodWordFields = ['lex', 'wf', 'wf_display',
                            'parts', 'gloss', 'gloss_index', 'n_ana',
                            'trans_en', 'trans_ru']
     self.AdditionalWordFields = set()
     if 'word_fields' in self.settings:
         self.AdditionalWordFields |= set(self.settings['word_fields'])
     if 'word_table_fields' in self.settings:
         self.AdditionalWordFields |= set(self.settings['word_table_fields'])
     if 'accidental_word_fields' in self.settings:
         self.AdditionalWordFields -= set(self.settings['accidental_word_fields'])
     f = open(os.path.join(self.SETTINGS_DIR, 'categories.json'),
              'r', encoding='utf-8')
     categories = json.loads(f.read())
     self.goodWordFields += ['gr.' + v for lang in categories
                             for v in categories[lang].values()]
     self.goodWordFields = set(self.goodWordFields)
     f.close()
     self.pd = PrepareData()
     self.es = Elasticsearch()
     self.es_ic = IndicesClient(self.es)
     self.shuffled_ids = [i for i in range(1, 1000000)]
     random.shuffle(self.shuffled_ids)
     self.shuffled_ids.insert(0, 0)    # id=0 is special and should not change
     self.tmpWordIDs = [{} for i in range(len(self.languages))]    # word as JSON -> its integer ID
     self.tmpLemmaIDs = [{} for i in range(len(self.languages))]   # lemma as string -> its integer ID
     self.word2lemma = [{} for i in range(len(self.languages))]    # word's ID -> ID of its lemma (or -1, if none)
     self.wordFreqs = [{} for i in range(len(self.languages))]     # word's ID -> its frequency
     self.wordSFreqs = [{} for i in range(len(self.languages))]    # word's ID -> its number of sentences
     self.wordDocFreqs = [{} for i in range(len(self.languages))]  # (word's ID, dID) -> word frequency in the document
     # self.wordSIDs = [{} for i in range(len(self.languages))]      # word's ID -> set of sentence IDs
     self.wordDIDs = [{} for i in range(len(self.languages))]      # word's ID -> set of document IDs
     self.wfs = set()         # set of word forms (for sorting)
     self.lemmata = set()     # set of lemmata (for sorting)
     self.sID = 0          # current sentence ID for each language
     self.dID = 0          # current document ID
     self.wID = 0          # current word ID
     self.wordFreqID = 0
     self.numWords = 0     # number of words in current document
     self.numSents = 0     # number of sentences in current document
     self.numWordsLang = [0] * len(self.languages)    # number of words in each language in current document
     self.numSentsLang = [0] * len(self.languages)    # number of sentences in each language in current document
     self.totalNumWords = 0
Exemple #2
0
 def __init__(self, settings):
     self.settings = CorpusSettings()
     self.settings.load_settings(
         os.path.join(self.SETTINGS_DIR, 'corpus.json'),
         os.path.join(self.SETTINGS_DIR, 'categories.json'))
     self.sentView = SentenceViewer(self.settings, None, fullText=True)
     self.iterSent = None
     if self.settings.input_format in ['json', 'json-gzip']:
         self.iterSent = JSONDocReader(format=self.settings.input_format,
                                       settings=settings)
     self.lastSentNum = 0  # for the IDs in the HTML
 def __init__(self,
              input_format='json',
              lang='kpv',
              langCode=0,
              minAnalyzed=0.66,
              alphabet='[а-яёӧі -]'):
     """
     Only add sentences where the "lang" attribute equals langCode.
     OExclude sentences where the share of analyzed words is less
     than minAnalyzed.
     """
     self.lang = lang
     self.input_format = input_format
     self.langCode = langCode
     self.minAnalyzed = minAnalyzed
     self.lp = LanguageProcessor(self.lang)
     self.iterSent = None
     self.rxAlphabet = re.compile('^' + alphabet + '+$')
     if self.input_format not in ['json', 'json-gzip']:
         print('Format should equal either "json" or "json-gzip".')
     else:
         self.iterSent = JSONDocReader(format=self.input_format)
Exemple #4
0
    def __init__(self, overwrite=False):
        self.overwrite = overwrite  # whether to overwrite an existing index without asking
        with open(os.path.join(self.SETTINGS_DIR, 'corpus.json'),
                  'r',
                  encoding='utf-8') as fSettings:
            self.settings = json.load(fSettings)
        self.j2h = JSON2HTML(settings=self.settings)
        self.name = self.settings['corpus_name']
        self.languages = self.settings['languages']
        if len(self.languages) <= 0:
            self.languages = [self.name]
        self.input_format = self.settings['input_format']
        self.corpus_dir = os.path.join('../corpus', self.name)
        self.iterSent = None
        if self.input_format in ['json', 'json-gzip']:
            self.iterSent = JSONDocReader(format=self.input_format,
                                          settings=self.settings)

        # Make sure only commonly used word fields and those listed
        # in corpus.json get into the words index.
        self.goodWordFields = [
            'lex',  # lemma
            'wf',  # word form (for search)
            'wf_display',  # word form (for display; optional)
            'parts',  # morpheme breaks in the word form
            'gloss',  # glosses (for display)
            'gloss_index',  # glosses (for search)
            'n_ana'  # number of analyses
        ]
        self.additionalWordFields = set()
        if 'word_fields' in self.settings:
            self.additionalWordFields |= set(self.settings['word_fields'])
        if 'word_table_fields' in self.settings:
            self.additionalWordFields |= set(
                self.settings['word_table_fields'])
        if 'accidental_word_fields' in self.settings:
            self.additionalWordFields -= set(
                self.settings['accidental_word_fields'])
        f = open(os.path.join(self.SETTINGS_DIR, 'categories.json'),
                 'r',
                 encoding='utf-8')
        categories = json.loads(f.read())
        f.close()
        self.goodWordFields += [
            'gr.' + v for lang in categories
            for v in categories[lang].values()
        ]
        self.goodWordFields = set(self.goodWordFields)
        self.characterRegexes = {}

        self.pd = PrepareData()
        self.es = Elasticsearch()
        self.es_ic = IndicesClient(self.es)
        self.shuffled_ids = [i for i in range(1, 1000000)]
        random.shuffle(self.shuffled_ids)
        self.shuffled_ids.insert(0, 0)  # id=0 is special and should not change
        self.tmpWordIDs = [{} for i in range(len(self.languages))
                           ]  # word as JSON -> its integer ID
        self.tmpLemmaIDs = [{} for i in range(len(self.languages))
                            ]  # lemma as string -> its integer ID
        # Apart from the two dictionaries above, words and lemmata
        # have string IDs starting with 'w' or 'l' followed by an integer
        self.word2lemma = [
            {} for i in range(len(self.languages))
        ]  # word/lemma ID -> ID of its lemma (or -1, if none)
        self.wordFreqs = [{} for i in range(len(self.languages))
                          ]  # word/lemma ID -> its frequency
        self.wordSFreqs = [{} for i in range(len(self.languages))
                           ]  # word/lemma ID -> its number of sentences
        self.wordDocFreqs = [
            {} for i in range(len(self.languages))
        ]  # (word/lemma ID, dID) -> word frequency in the document
        # self.wordSIDs = [{} for i in range(len(self.languages))]      # word's ID -> set of sentence IDs
        self.wordDIDs = [{} for i in range(len(self.languages))
                         ]  # word/lemma ID -> set of document IDs
        self.wfs = set()  # set of word forms (for sorting)
        self.lemmata = set()  # set of lemmata (for sorting)
        self.sID = 0  # current sentence ID for each language
        self.dID = 0  # current document ID
        self.wID = 0  # current word ID
        self.wordFreqID = 0  # current word_freq ID for word/document frequencies
        self.lemmaFreqID = 0  # current word_freq ID for lemma/document frequencies
        self.numWords = 0  # number of words in current document
        self.numSents = 0  # number of sentences in current document
        self.numWordsLang = [0] * len(
            self.languages
        )  # number of words in each language in current document
        self.numSentsLang = [0] * len(
            self.languages
        )  # number of sentences in each language in current document
        self.totalNumWords = 0
Exemple #5
0
class Indexator:
    """
    Contains methods for loading the JSON documents in the corpus
    database.
    """
    SETTINGS_DIR = '../conf'
    rxBadFileName = re.compile('[^\\w_.-]*', flags=re.DOTALL)

    def __init__(self, overwrite=False):
        self.overwrite = overwrite  # whether to overwrite an existing index without asking
        with open(os.path.join(self.SETTINGS_DIR, 'corpus.json'),
                  'r',
                  encoding='utf-8') as fSettings:
            self.settings = json.load(fSettings)
        self.j2h = JSON2HTML(settings=self.settings)
        self.name = self.settings['corpus_name']
        self.languages = self.settings['languages']
        if len(self.languages) <= 0:
            self.languages = [self.name]
        self.input_format = self.settings['input_format']
        self.corpus_dir = os.path.join('../corpus', self.name)
        self.iterSent = None
        if self.input_format in ['json', 'json-gzip']:
            self.iterSent = JSONDocReader(format=self.input_format,
                                          settings=self.settings)

        # Make sure only commonly used word fields and those listed
        # in corpus.json get into the words index.
        self.goodWordFields = [
            'lex',  # lemma
            'wf',  # word form (for search)
            'wf_display',  # word form (for display; optional)
            'parts',  # morpheme breaks in the word form
            'gloss',  # glosses (for display)
            'gloss_index',  # glosses (for search)
            'n_ana'  # number of analyses
        ]
        self.additionalWordFields = set()
        if 'word_fields' in self.settings:
            self.additionalWordFields |= set(self.settings['word_fields'])
        if 'word_table_fields' in self.settings:
            self.additionalWordFields |= set(
                self.settings['word_table_fields'])
        if 'accidental_word_fields' in self.settings:
            self.additionalWordFields -= set(
                self.settings['accidental_word_fields'])
        f = open(os.path.join(self.SETTINGS_DIR, 'categories.json'),
                 'r',
                 encoding='utf-8')
        categories = json.loads(f.read())
        f.close()
        self.goodWordFields += [
            'gr.' + v for lang in categories
            for v in categories[lang].values()
        ]
        self.goodWordFields = set(self.goodWordFields)
        self.characterRegexes = {}

        self.pd = PrepareData()
        self.es = Elasticsearch()
        self.es_ic = IndicesClient(self.es)
        self.shuffled_ids = [i for i in range(1, 1000000)]
        random.shuffle(self.shuffled_ids)
        self.shuffled_ids.insert(0, 0)  # id=0 is special and should not change
        self.tmpWordIDs = [{} for i in range(len(self.languages))
                           ]  # word as JSON -> its integer ID
        self.tmpLemmaIDs = [{} for i in range(len(self.languages))
                            ]  # lemma as string -> its integer ID
        # Apart from the two dictionaries above, words and lemmata
        # have string IDs starting with 'w' or 'l' followed by an integer
        self.word2lemma = [
            {} for i in range(len(self.languages))
        ]  # word/lemma ID -> ID of its lemma (or -1, if none)
        self.wordFreqs = [{} for i in range(len(self.languages))
                          ]  # word/lemma ID -> its frequency
        self.wordSFreqs = [{} for i in range(len(self.languages))
                           ]  # word/lemma ID -> its number of sentences
        self.wordDocFreqs = [
            {} for i in range(len(self.languages))
        ]  # (word/lemma ID, dID) -> word frequency in the document
        # self.wordSIDs = [{} for i in range(len(self.languages))]      # word's ID -> set of sentence IDs
        self.wordDIDs = [{} for i in range(len(self.languages))
                         ]  # word/lemma ID -> set of document IDs
        self.wfs = set()  # set of word forms (for sorting)
        self.lemmata = set()  # set of lemmata (for sorting)
        self.sID = 0  # current sentence ID for each language
        self.dID = 0  # current document ID
        self.wID = 0  # current word ID
        self.wordFreqID = 0  # current word_freq ID for word/document frequencies
        self.lemmaFreqID = 0  # current word_freq ID for lemma/document frequencies
        self.numWords = 0  # number of words in current document
        self.numSents = 0  # number of sentences in current document
        self.numWordsLang = [0] * len(
            self.languages
        )  # number of words in each language in current document
        self.numSentsLang = [0] * len(
            self.languages
        )  # number of sentences in each language in current document
        self.totalNumWords = 0

    def delete_indices(self):
        """
        If there already exist indices with the same names,
        ask the user if they want to overwrite them. If they
        say yes, remove the indices and return True. Otherwise,
        return False.
        """
        if not self.overwrite:
            if (self.es_ic.exists(index=self.name + '.docs')
                    or self.es_ic.exists(index=self.name + '.words')
                    or self.es_ic.exists(index=self.name + '.sentences')):
                print('It seems that a corpus named "' + self.name +
                      '" already exists. ' +
                      'Do you want to overwrite it? [y/n]')
                reply = input()
                if reply.lower() != 'y':
                    print('Indexation aborted.')
                    return False
        if self.es_ic.exists(index=self.name + '.docs'):
            self.es_ic.delete(index=self.name + '.docs')
        if self.es_ic.exists(index=self.name + '.words'):
            self.es_ic.delete(index=self.name + '.words')
        if self.es_ic.exists(index=self.name + '.sentences'):
            self.es_ic.delete(index=self.name + '.sentences')
        # Obsolete index word_freq can be present in pre-2019 corpora
        if self.es_ic.exists(index=self.name + '.word_freqs'):
            self.es_ic.delete(index=self.name + '.word_freqs')
        return True

    def create_indices(self):
        """
        Create empty elasticsearch indices for corpus data, using
        mappings provided by PrepareData.
        """
        self.sentWordMapping = self.pd.generate_words_mapping(wordFreqs=False)
        self.wordMapping = self.pd.generate_words_mapping(wordFreqs=True)
        self.sentMapping = self.pd.generate_sentences_mapping(
            self.sentWordMapping)
        self.docMapping = self.pd.generate_docs_mapping()

        self.es_ic.create(index=self.name + '.docs', body=self.docMapping)
        self.es_ic.create(index=self.name + '.words', body=self.wordMapping)
        self.es_ic.create(index=self.name + '.sentences',
                          body=self.sentMapping)

    def randomize_id(self, realID):
        """
        Return a (relatively) randomized sentence ID. This randomization
        is needed in context-aware word queries where the sentences
        are iterated in the order determined by their IDs.
        """
        if realID < 0:
            return realID
        idStart, idEnd = realID // 1000000, realID % 1000000
        return idStart * 1000000 + self.shuffled_ids[idEnd]

    def enhance_word(self, word):
        """
        Add some calculated fields to the JSON word.
        """
        if 'ana' not in word:
            word['n_ana'] = 0
        else:
            word['n_ana'] = len(word['ana'])
            # n_ana is a (signed) byte, so a word can have at most 127 analyses
            if word['n_ana'] >= 127:
                word['n_ana'] = 127

    def clean_word(self, w, langID):
        """
        Clean a word object by removing unnecessary fields, lowercasing
        things if needed, etc. Return the cleaned object and the lemma.
        Add word form and lemma to the global lists.
        """
        wClean = {'lang': langID}
        lemma = ''
        for field in w:
            if field in self.goodWordFields or field in self.additionalWordFields:
                wClean[field] = w[field]
                if field == 'wf':
                    if 'wf_lowercase' not in self.settings or self.settings[
                            'wf_lowercase']:
                        wClean[field] = wClean[field].lower()
                    self.wfs.add(wClean[field])
        if 'ana' in w:
            lemma = self.get_lemma(w)
            self.lemmata.add(lemma)
            wClean['ana'] = []
            for ana in w['ana']:
                cleanAna = {}
                for anaField in ana:
                    if anaField in self.goodWordFields or anaField in self.additionalWordFields:
                        cleanAna[anaField] = ana[anaField]
                wClean['ana'].append(cleanAna)
        return wClean, lemma

    def process_sentence_words(self, words, langID):
        """
        Take words from a sentence, remove all non-searchable
        fields from them and add them to self.words dictionary.
        Add w_id and l_id properties to each word of the words list.
        Return the value of the 'sent_analyzed' meta field.
        """
        sIDAdded = set(
        )  # word IDs for which the current settence ID has been counted
        bFullyAnalyzed = True  # Whether each word in the sentence is analyzed
        bUniquelyAnalyzed = True  # Whether, in addition, each word has exactly one analysis
        for w in words:
            if w['wtype'] != 'word':
                continue
            self.numWords += 1
            self.numWordsLang[langID] += 1
            self.totalNumWords += 1
            self.enhance_word(w)

            if 'ana' not in w or len(w['ana']) <= 0:
                bFullyAnalyzed = False
                bUniquelyAnalyzed = False
            elif len(w['ana']) > 1:
                bUniquelyAnalyzed = False

            wClean, lemma = self.clean_word(w, langID)
            wCleanTxt = json.dumps(wClean, ensure_ascii=False, sort_keys=True)
            if wCleanTxt in self.tmpWordIDs[langID]:
                wID = self.tmpWordIDs[langID][wCleanTxt]
            else:
                wID = sum(
                    len(self.tmpWordIDs[i])
                    for i in range(len(self.languages)))
                self.tmpWordIDs[langID][wCleanTxt] = wID
            wID = 'w' + str(wID)
            w['w_id'] = wID
            lID = 'l0'  # Default: no analysis
            if len(lemma) > 0:
                try:
                    lID = self.tmpLemmaIDs[langID][lemma]
                except KeyError:
                    lID = sum(
                        len(self.tmpLemmaIDs[i])
                        for i in range(len(self.languages))) + 1
                    self.tmpLemmaIDs[langID][lemma] = lID
                lID = 'l' + str(lID)
                self.word2lemma[langID][wID] = lID
            w['l_id'] = lID
            for itemID in [wID, lID]:
                try:
                    self.wordFreqs[langID][itemID] += 1
                except KeyError:
                    self.wordFreqs[langID][itemID] = 1
                if itemID not in sIDAdded:
                    sIDAdded.add(itemID)
                    try:
                        self.wordSFreqs[langID][itemID] += 1
                    except KeyError:
                        self.wordSFreqs[langID][itemID] = 1
                try:
                    self.wordDIDs[langID][itemID].add(self.dID)
                except KeyError:
                    self.wordDIDs[langID][itemID] = {self.dID}
                try:
                    self.wordDocFreqs[langID][(itemID, self.dID)] += 1
                except KeyError:
                    self.wordDocFreqs[langID][(itemID, self.dID)] = 1
        if not bFullyAnalyzed:
            return 'incomplete'
        if not bUniquelyAnalyzed:
            return 'complete'
        return 'unique'

    def character_regex(self, lang):
        """
        Regex for splitting text into characters. Takes into account
        multicharacter sequences (digraphs etc.) defined in lang_props.lexicographic_order.
        """
        if lang in self.characterRegexes:
            return self.characterRegexes[lang]  # cache
        if lang not in self.settings[
                'lang_props'] or 'lexicographic_order' not in self.settings[
                    'lang_props'][lang]:
            self.characterRegexes[lang] = re.compile('.')
            return self.characterRegexes[lang]
        rxChars = '(' + '|'.join(
            re.escape(c.lower()) for c in sorted(
                self.settings['lang_props'][lang]['lexicographic_order'],
                key=lambda x: (-len(x), x)) if len(c) > 1)
        if len(rxChars) > 1:
            rxChars += '|'
        rxChars += '.)'
        rxChars = re.compile(rxChars)
        self.characterRegexes[lang] = rxChars
        return rxChars

    def make_sorting_function(self, lang):
        """
        Return a function that can be used for sorting tokens
        in a list according to the alphabetical ordering specified
        for the language lang.
        """
        sortingFunction = lambda x: x
        if lang in self.settings[
                'lang_props'] and 'lexicographic_order' in self.settings[
                    'lang_props'][lang]:
            dictSort = {
                self.settings['lang_props'][lang]['lexicographic_order'][i]:
                (i,
                 self.settings['lang_props'][lang]['lexicographic_order'][i])
                for i in range(
                    len(self.settings['lang_props'][lang]
                        ['lexicographic_order']))
            }
            maxIndex = len(dictSort)
            rxChars = self.character_regex(lang)

            def charReplaceFunction(c):
                if c in dictSort:
                    return dictSort[c]
                return (maxIndex, c)

            sortingFunction = lambda x: [
                charReplaceFunction(c) for c in rxChars.findall(x.lower())
            ]
        return sortingFunction

    def sort_words(self, lang):
        """
        Sort word forms and lemmata stored at earlier stages.
        Return dictionaries with positions of word forms and
        lemmata in the sorted list.
        If there is a custom alphabetical order for the language,
        use it. Otherwise, use standard lexicographic sorting.
        """
        wfsSorted = {}
        iOrder = 0
        sortingFunction = self.make_sorting_function(lang)
        for wf in sorted(self.wfs, key=sortingFunction):
            wfsSorted[wf] = iOrder
            iOrder += 1
        lemmataSorted = {}
        iOrder = 0
        for l in sorted(self.lemmata, key=sortingFunction):
            lemmataSorted[l] = iOrder
            iOrder += 1
        return wfsSorted, lemmataSorted

    def get_freq_ranks(self, freqsSorted):
        """
        Calculate frequency ranks and rank/quantile labels for words
        or lemmata.
        """
        freqToRank = {}
        quantiles = {}
        prevFreq = 0
        prevRank = 0
        for i in range(len(freqsSorted)):
            v = freqsSorted[i]
            if v != prevFreq:
                if prevFreq != 0:
                    freqToRank[prevFreq] = prevRank + (i - prevRank) // 2
                prevRank = i
                prevFreq = v
        if prevFreq != 0:
            freqToRank[prevFreq] = prevRank + (len(freqsSorted) -
                                               prevRank) // 2
        for q in [0.03, 0.04, 0.05, 0.1, 0.15, 0.2, 0.25, 0.5]:
            qIndex = math.ceil(q * len(freqsSorted))
            if qIndex >= len(freqsSorted):
                qIndex = len(freqsSorted) - 1
            if qIndex >= 0:
                quantiles[q] = freqsSorted[qIndex]
            else:
                quantiles[q] = 0
        return freqToRank, quantiles

    def quantile_label(self, freq, rank, quantiles):
        """
        Return a string label of the frequency rank (for frequent items)
        or quantile. This label is showed to the user in word query results.
        """
        if freq > 1 and freq >= quantiles[0.5]:
            if freq > quantiles[0.03]:
                return '#' + str(rank + 1)
            else:
                return '&gt; ' + str(
                    min(
                        math.ceil(q * 100)
                        for q in quantiles if freq >= quantiles[q])) + '%'
        return ''

    def get_lemma(self, word, lower_lemma=True):
        """
        Join all lemmata in the JSON representation of a word with
        an analysis and return them as a string.
        """
        if 'ana' not in word:
            return ''
        if 'keep_lemma_order' not in self.settings or not self.settings[
                'keep_lemma_order']:
            curLemmata = set()
            for ana in word['ana']:
                if 'lex' in ana:
                    if type(ana['lex']) == list:
                        for l in ana['lex']:
                            lAdd = l
                            if lower_lemma:
                                lAdd = lAdd.lower()
                            curLemmata.add(lAdd)
                    else:
                        lAdd = ana['lex']
                        if lower_lemma:
                            lAdd = lAdd.lower()
                        curLemmata.add(lAdd)
            return '/'.join(l for l in sorted(curLemmata))
        curLemmata = []
        for ana in word['ana']:
            if 'lex' in ana:
                if type(ana['lex']) == list:
                    for l in ana['lex']:
                        lAdd = l
                        if lower_lemma:
                            lAdd = lAdd.lower()
                        curLemmata.append(lAdd)
                else:
                    lAdd = ana['lex']
                    if lower_lemma:
                        lAdd = lAdd.lower()
                    curLemmata.append(lAdd)
        return '/'.join(curLemmata)

    def get_grdic(self, word, lang):
        """
        Join all dictionary grammar tags strings in the JSON representation of a word with
        an analysis and return them as a string.
        """
        if 'ana' not in word:
            return ''
        curGramm = set()
        translations = set()
        for ana in word['ana']:
            grTags = ''
            if 'gr.pos' in ana:
                value = ana['gr.pos']
                if type(value) == list:
                    value = ', '.join(value)
                grTags = value
            for field in sorted(ana):
                value = ana[field]
                if type(value) == list:
                    value = ', '.join(value)
                if ('lang_props' in self.settings
                        and lang in self.settings['lang_props']
                        and 'dictionary_categories'
                        in self.settings['lang_props'][lang]
                        and field.startswith('gr.')
                        and field[3:] in self.settings['lang_props'][lang]
                    ['dictionary_categories']):
                    if len(grTags) > 0:
                        grTags += ', '
                    grTags += value
                elif field.startswith('trans_'):
                    translations.add(value)
            if len(grTags) > 0:
                curGramm.add(grTags)
        return ' | '.join(grdic for grdic in sorted(curGramm)), ' | '.join(
            tr for tr in sorted(translations))

    def iterate_lemmata(self, langID, lemmataSorted):
        """
        Iterate over all lemmata for one language collected at the
        word iteration stage.
        """
        lFreqsSorted = [
            self.wordFreqs[langID][itemID] for itemID in self.wordFreqs[langID]
            if itemID.startswith('l')
        ]
        lFreqsSorted.sort(reverse=True)
        lemmaFreqToRank, quantiles = self.get_freq_ranks(lFreqsSorted)
        iLemma = 0
        for l, lID in self.tmpLemmaIDs[langID].items():
            lID = 'l' + str(lID)
            if iLemma % 250 == 0:
                print('indexing lemma', iLemma)
            lOrder = lemmataSorted[l]
            lemmaJson = {
                'wf':
                l,
                'wtype':
                'lemma',
                'lang':
                langID,
                'l_order':
                lOrder,
                'freq':
                self.wordFreqs[langID][lID],
                'lemma_freq':
                self.wordFreqs[langID][lID],
                'rank_true':
                lemmaFreqToRank[self.wordFreqs[langID][lID]],
                'rank':
                self.quantile_label(
                    self.wordFreqs[langID][lID],
                    lemmaFreqToRank[self.wordFreqs[langID][lID]], quantiles),
                'n_sents':
                self.wordSFreqs[langID][lID],
                'n_docs':
                len(self.wordDIDs[langID][lID]),
                'freq_join':
                'word'
            }
            curAction = {
                '_index': self.name + '.words',
                '_id': lID,
                '_source': lemmaJson
            }
            iLemma += 1
            yield curAction

            for docID in self.wordDIDs[langID][lID]:
                lfreqJson = {
                    'wtype': 'word_freq',
                    'l_id': lID,
                    'd_id': docID,
                    'l_order': lOrder,
                    'freq': self.wordDocFreqs[langID][(lID, docID)],
                    'freq_join': {
                        'name': 'word_freq',
                        'parent': lID
                    }
                }
                curAction = {
                    '_index': self.name + '.words',
                    '_id': 'lfreq' + str(self.lemmaFreqID),
                    '_source': lfreqJson,
                    '_routing': lID
                }
                self.lemmaFreqID += 1
                yield curAction

    def iterate_words(self):
        """
        Iterate through all words collected at the previous
        stage. Return JSON objects with actions for bulk indexing
        in Elasticsearch.
        """
        self.wID = 0

        for langID in range(len(self.languages)):
            wfsSorted, lemmataSorted = self.sort_words(self.languages[langID])
            iWord = 0
            print('Processing words in ' + self.languages[langID] + '...')

            wFreqsSorted = [
                self.wordFreqs[langID][itemID]
                for itemID in self.wordFreqs[langID] if itemID.startswith('w')
            ]
            wFreqsSorted.sort(reverse=True)
            wordFreqToRank, quantiles = self.get_freq_ranks(wFreqsSorted)

            lFreqsSorted = [
                self.wordFreqs[langID][itemID]
                for itemID in self.wordFreqs[langID] if itemID.startswith('l')
            ]
            lFreqsSorted.sort(reverse=True)
            lemmaFreqToRank, lemmaQuantiles = self.get_freq_ranks(lFreqsSorted)

            # for wID in self.wordFreqs[langID]:
            for w, wID in self.tmpWordIDs[langID].items():
                wID = 'w' + str(wID)
                if iWord % 500 == 0:
                    print('indexing word', iWord)
                try:
                    lID = self.word2lemma[langID][wID]
                except KeyError:
                    lID = 'l0'
                wJson = json.loads(w)
                wfOrder = len(wfsSorted) + 1
                if 'wf' in wJson:
                    wfOrder = wfsSorted[wJson['wf']]
                lOrder = len(lemmataSorted) + 1
                if 'ana' in wJson:
                    lOrder = lemmataSorted[self.get_lemma(wJson)]
                wJson['wf_order'] = wfOrder
                wJson['l_order'] = lOrder
                wJson['l_id'] = lID
                wordFreq = self.wordFreqs[langID][wID]
                lemmaFreq = self.wordFreqs[langID][lID]
                wJson['freq'] = wordFreq
                wJson['lemma_freq'] = lemmaFreq
                # wJson['sids'] = [sid for sid in sorted(self.wordSIDs[langID][wID])]
                wJson['dids'] = [
                    did for did in sorted(self.wordDIDs[langID][wID])
                ]
                wJson['n_sents'] = self.wordSFreqs[langID][wID]
                wJson['n_docs'] = len(wJson['dids'])
                wJson['rank_true'] = wordFreqToRank[
                    wJson['freq']]  # for the calculations
                wJson['lemma_rank_true'] = lemmaFreqToRank[
                    self.wordFreqs[langID][lID]]  # for the calculations
                wJson['rank'] = self.quantile_label(wJson['freq'],
                                                    wJson['rank_true'],
                                                    quantiles)  # for the user
                wJson['freq_join'] = 'word'
                wJson['wtype'] = 'word'
                curAction = {
                    '_index': self.name + '.words',
                    '_id': wID,
                    '_source': wJson
                }
                yield curAction

                for docID in wJson['dids']:
                    wfreqJson = {
                        'wtype': 'word_freq',
                        'w_id': wID,
                        'l_id': lID,
                        'd_id': docID,
                        'wf_order': wfOrder,
                        'l_order': lOrder,
                        'freq': self.wordDocFreqs[langID][(wID, docID)],
                        'freq_join': {
                            'name': 'word_freq',
                            'parent': wID
                        }
                    }
                    curAction = {
                        '_index': self.name + '.words',
                        '_id': 'wfreq' + str(self.wordFreqID),
                        '_source': wfreqJson,
                        '_routing': wID
                    }
                    self.wordFreqID += 1
                    yield curAction
                iWord += 1
                self.wID += 1
            for lAction in self.iterate_lemmata(langID, lemmataSorted):
                yield lAction
        emptyLemmaJson = {
            'wf': '',
            'wtype': 'lemma',
            'freq': 0,
            'rank_true': -1
        }
        curAction = {
            '_index': self.name + '.words',
            '_id': 'l0',  # l prefix stands for "lemma"
            '_source': emptyLemmaJson
        }
        yield curAction
        self.wfs = None
        self.lemmata = None

    def generate_dictionary(self):
        """
        For each language, print out an HTML dictionary containing all lexemes of the corpus.
        """
        for langID in range(len(self.languages)):
            iWord = 0
            print('Generating dictionary for ' + self.languages[langID] +
                  '...')
            lexFreqs = {}  # lemma ID -> its frequency
            wFreqsSorted = [
                v
                for v in sorted(self.wordFreqs[langID].values(), reverse=True)
            ]
            freqToRank, quantiles = self.get_freq_ranks(wFreqsSorted)
            # for wID in self.wordFreqs[langID]:
            for w, wID in self.tmpWordIDs[langID].items():
                wID = 'w' + str(wID)
                if iWord % 1000 == 0:
                    print('processing word', iWord, 'for the dictionary')
                iWord += 1
                wJson = json.loads(w)
                if 'ana' not in wJson or len(wJson['ana']) <= 0:
                    continue
                lemma = self.get_lemma(wJson, lower_lemma=False)
                grdic, translations = self.get_grdic(wJson,
                                                     self.languages[langID])
                wordFreq = self.wordFreqs[langID][wID]
                lexTuple = (lemma, grdic, translations)
                if lexTuple not in lexFreqs:
                    lexFreqs[lexTuple] = wordFreq
                else:
                    lexFreqs[lexTuple] += wordFreq
            if len(lexFreqs) <= 0:
                continue

            if not os.path.exists('../search/web_app/templates/dictionaries'):
                os.makedirs('../search/web_app/templates/dictionaries')
            fOut = open(os.path.join(
                '../search/web_app/templates/dictionaries',
                'dictionary_' + self.settings['corpus_name'] + '_' +
                self.languages[langID] + '.html'),
                        'w',
                        encoding='utf-8')
            fOut.write(
                '<h1 class="dictionary_header"> {{ _(\'Dictionary_header\') }} '
                '({{ _(\'langname_' + self.languages[langID] +
                '\') }})</h1>\n')
            prevLetter = ''
            sortingFunction = self.make_sorting_function(
                self.settings['languages'][langID])
            for lemma, grdic, trans in sorted(
                    lexFreqs,
                    key=lambda x:
                (sortingFunction(x[0].lower()), -lexFreqs[x])):
                if len(lemma) <= 0:
                    continue
                mChar = self.character_regex(self.languages[langID]).search(
                    lemma.lower())
                if mChar is None:
                    curLetter = '*'
                else:
                    curLetter = mChar.group(0)
                if curLetter != prevLetter:
                    if prevLetter != '':
                        fOut.write('</tbody>\n</table>\n')
                    fOut.write('<h2 class="dictionary_letter">' +
                               curLetter.upper() + '</h2>\n')
                    fOut.write('<table class="dictionary_table">\n<thead>\n'
                               '<th>{{ _(\'word_th_lemma\') }}</th>'
                               '<th>{{ _(\'word_th_gr\') }}</th>'
                               '<th>{{ _(\'word_th_trans_en\') }}</th>'
                               '<th>{{ _(\'word_th_frequency\') }}</th>'
                               '</thead>\n<tbody>\n')
                    prevLetter = curLetter
                fOut.write('<tr>\n<td class="dictionary_lemma">' + lemma +
                           '</td><td>' + grdic + '</td>'
                           '<td>' + trans + '</td><td>' +
                           str(lexFreqs[(lemma, grdic, trans)]) +
                           '</td></tr>\n')
            if prevLetter != '':
                fOut.write('</tbody>\n</table>\n')
            fOut.close()

    def index_words(self):
        """
        Index all words that have been collected at the previous stage
        in self.words (while the sentences were being indexed).
        """
        bulk(self.es, self.iterate_words(), chunk_size=300, request_timeout=60)
        if 'generate_dictionary' in self.settings and self.settings[
                'generate_dictionary']:
            self.generate_dictionary()

    def add_parallel_sids(self, sentences, paraIDs):
        """
        In the parallel corpus, add the IDs of aligned sentences in other languages
        to each sentence that has a para_id.
        """
        for s in sentences:
            if 'para_alignment' not in s['_source'] or 'lang' not in s[
                    '_source']:
                continue
            langID = s['_source']['lang']
            for pa in s['_source']['para_alignment']:
                paraID = pa['para_id']
                pa['sent_ids'] = []
                for i in range(len(self.languages)):
                    if i == langID:
                        continue
                    if paraID in paraIDs[i]:
                        pa['sent_ids'] += paraIDs[i][paraID]

    def iterate_sentences(self, fname):
        self.numSents = 0
        prevLast = False
        sentences = []
        paraIDs = [{} for i in range(len(self.languages))]
        for s, bLast in self.iterSent.get_sentences(fname):
            if 'lang' in s:
                langID = s['lang']
            else:
                langID = 0
                s['lang'] = langID
            s['n_words'] = 0
            if 'words' in s:
                sentAnaMeta = self.process_sentence_words(s['words'], langID)
                s['n_words'] = sum(1 for w in s['words']
                                   if 'wtype' in w and w['wtype'] == 'word')
                if 'meta' not in s:
                    s['meta'] = {}
                s['meta']['sent_analyses'] = sentAnaMeta
            if prevLast:
                prevLast = False
            elif self.numSents > 0:
                s['prev_id'] = self.randomize_id(self.sID - 1)
            if not bLast and 'last' not in s:
                s['next_id'] = self.randomize_id(self.sID + 1)
            else:
                prevLast = True
            s['doc_id'] = self.dID
            if 'meta' in s:
                for metaField in [
                        mf for mf in s['meta'].keys()
                        if not (mf.startswith('year') or mf.endswith('_kw'))
                ]:
                    s['meta'][metaField + '_kw'] = s['meta'][metaField]
            # self.es.index(index=self.name + '.sentences',
            #               id=self.sID,
            #               body=s)
            curAction = {
                '_index': self.name + '.sentences',
                '_id': self.randomize_id(self.sID),
                '_source': s
            }
            if len(self.languages) <= 1:
                yield curAction
            else:
                sentences.append(curAction)
                if 'para_alignment' in s:
                    s['para_ids'] = []
                    for pa in s['para_alignment']:
                        paraID = str(self.dID) + '_' + str(pa['para_id'])
                        pa['para_id'] = paraID
                        s['para_ids'].append(paraID)
                        try:
                            paraIDs[langID][paraID].append(
                                self.randomize_id(self.sID))
                        except KeyError:
                            paraIDs[langID][paraID] = [
                                self.randomize_id(self.sID)
                            ]
            if self.sID % 500 == 0:
                print('Indexing sentence', self.sID, ',', self.totalNumWords,
                      'words so far.')
            self.numSents += 1
            self.numSentsLang[langID] += 1
            self.sID += 1
        if len(self.languages) > 1:
            self.add_parallel_sids(sentences, paraIDs)
            for s in sentences:
                yield s

    @staticmethod
    def add_meta_keywords(meta):
        """
        For each text field in the metadata, add a keyword version
        of the same field.
        """
        for field in [k for k in meta.keys() if not k.startswith('year')]:
            meta[field + '_kw'] = meta[field]

    def index_doc(self, fname):
        """
        Store the metadata of the source file.
        """
        if self.dID % 100 == 0:
            print('Indexing document', self.dID)
        meta = self.iterSent.get_metadata(fname)
        self.add_meta_keywords(meta)
        meta['n_words'] = self.numWords
        meta['n_sents'] = self.numSents
        if len(self.settings['languages']) > 1:
            for i in range(len(self.languages)):
                meta['n_words_' + self.languages[i]] = self.numWordsLang[i]
                meta['n_sents_' + self.languages[i]] = self.numSentsLang[i]
        self.numWords = 0
        self.numSents = 0
        self.numWordsLang = [0] * len(self.languages)
        self.numSentsLang = [0] * len(self.languages)
        try:
            self.es.index(index=self.name + '.docs', id=self.dID, body=meta)
        except RequestError as err:
            print('Metadata error: {0}'.format(err))
            shortMeta = {}
            if 'filename' in meta:
                shortMeta['filename'] = meta['filename']
            if 'title' in meta:
                shortMeta['title'] = meta['title']
                shortMeta['title_kw'] = meta['title']
                self.es.index(index=self.name + '.docs',
                              id=self.dID,
                              body=shortMeta)
        if ('fulltext_view_enabled' in self.settings
                and self.settings['fulltext_view_enabled']
                and 'fulltext_id' in meta):
            fnameOut = meta['fulltext_id'] + '.json'
            self.j2h.process_file(
                fname,
                os.path.join('../search/corpus_html', self.name, fnameOut))
        self.dID += 1

    def index_dir(self):
        """
        Index all files from the corpus directory, sorted by their size
        in decreasing order. Such sorting helps prevent memory errors
        when indexing large corpora, as the default behavior is to load
        the whole file is into memory, and there is more free memory
        in the beginning of the process. If MemoryError occurs, the
        iterative JSON parser is used, which works much slower.
        """
        filenames = []
        for root, dirs, files in os.walk(self.corpus_dir):
            for fname in files:
                if (not ((self.settings['input_format'] == 'json'
                          and fname.lower().endswith('.json')) or
                         (self.settings['input_format'] == 'json-gzip'
                          and fname.lower().endswith('.json.gz')))):
                    continue
                fnameFull = os.path.join(root, fname)
                filenames.append((fnameFull, os.path.getsize(fnameFull)))
        if len(filenames) <= 0:
            print('There are no files in this corpus.')
            return
        for fname, fsize in sorted(filenames, key=lambda p: -p[1]):
            # print(fname, fsize)
            if 'sample_size' in self.settings and 0 < self.settings[
                    'sample_size'] < 1:
                # Only take a random sample of the source files (for test purposes)
                if random.random() > self.settings['sample_size']:
                    continue
            bulk(self.es,
                 self.iterate_sentences(fname),
                 chunk_size=200,
                 request_timeout=60)
            self.index_doc(fname)
        self.index_words()

    def compile_translations(self):
        """
        Compile flask_babel translations in ../search/web_app.
        """
        pythonPath = ''
        for p in sys.path:
            if re.search('Python3[^/\\\\]*[/\\\\]?$', p) is not None:
                pythonPath = p
                break
        if len(pythonPath) <= 0:
            pyBabelPath = 'pybabel'
        else:
            pyBabelPath = os.path.join(pythonPath, 'Scripts', 'pybabel')
        try:
            subprocess.run(
                [pyBabelPath, 'compile', '-d', 'translations_pybabel'],
                cwd='../search/web_app',
                check=True)
        except:
            print('Could not compile translations with ' + pyBabelPath + ' .')
        else:
            print('Interface translations compiled.')

    def load_corpus(self):
        """
        Drop the current database, if any, and load the entire corpus.
        """
        t1 = time.time()
        # self.compile_translations()
        indicesDeleted = self.delete_indices()
        if not indicesDeleted:
            return
        self.create_indices()
        self.index_dir()
        t2 = time.time()
        print('Corpus indexed in', t2 - t1, 'seconds:', self.dID, 'documents,',
              self.sID, 'sentences,', self.totalNumWords, 'words,',
              sum(len(self.wordFreqs[i]) for i in range(len(self.languages))),
              'word types (different words).')
Exemple #6
0
class Indexator:
    """
    Contains methods for loading the JSON documents in the corpus
    database.
    """
    SETTINGS_DIR = '../conf'

    def __init__(self):
        f = open(os.path.join(self.SETTINGS_DIR, 'corpus.json'),
                 'r',
                 encoding='utf-8')
        self.settings = json.loads(f.read())
        f.close()
        self.name = self.settings['corpus_name']
        self.languages = self.settings['languages']
        if len(self.languages) <= 0:
            self.languages = [self.name]
        self.input_format = self.settings['input_format']
        self.corpus_dir = os.path.join('../corpus', self.name)
        self.iterSent = None
        if self.input_format in ['json', 'json-gzip']:
            self.iterSent = JSONDocReader(format=self.input_format)
        self.goodWordFields = [
            'lex', 'wf', 'wf_display', 'parts', 'gloss', 'gloss_index',
            'n_ana', 'trans_en', 'trans_ru'
        ]
        self.AdditionalWordFields = set()
        if 'word_fields' in self.settings:
            self.AdditionalWordFields |= set(self.settings['word_fields'])
        if 'word_table_fields' in self.settings:
            self.AdditionalWordFields |= set(
                self.settings['word_table_fields'])
        f = open(os.path.join(self.SETTINGS_DIR, 'categories.json'),
                 'r',
                 encoding='utf-8')
        categories = json.loads(f.read())
        self.goodWordFields += [
            'gr.' + v for lang in categories
            for v in categories[lang].values()
        ]
        self.goodWordFields = set(self.goodWordFields)
        f.close()
        self.pd = PrepareData()
        self.es = Elasticsearch()
        self.es_ic = IndicesClient(self.es)
        self.shuffled_ids = [i for i in range(1, 1000000)]
        random.shuffle(self.shuffled_ids)
        self.shuffled_ids.insert(0, 0)  # id=0 is special and should not change
        self.tmpWordIDs = [{} for i in range(len(self.languages))
                           ]  # word as JSON -> its integer ID
        self.tmpLemmaIDs = [{} for i in range(len(self.languages))
                            ]  # lemma as string -> its integer ID
        self.word2lemma = [{} for i in range(len(self.languages))
                           ]  # word's ID -> ID of its lemma (or -1, if none)
        self.wordFreqs = [{} for i in range(len(self.languages))
                          ]  # word's ID -> its frequency
        self.wordSFreqs = [{} for i in range(len(self.languages))
                           ]  # word's ID -> its number of sentences
        self.wordDocFreqs = [
            {} for i in range(len(self.languages))
        ]  # (word's ID, dID) -> word frequency in the document
        # self.wordSIDs = [{} for i in range(len(self.languages))]      # word's ID -> set of sentence IDs
        self.wordDIDs = [{} for i in range(len(self.languages))
                         ]  # word's ID -> set of document IDs
        self.wfs = set()  # set of word forms (for sorting)
        self.lemmata = set()  # set of lemmata (for sorting)
        self.sID = 0  # current sentence ID for each language
        self.dID = 0  # current document ID
        self.wID = 0  # current word ID
        self.wordFreqID = 0
        self.numWords = 0  # number of words in current document
        self.numSents = 0  # number of sentences in current document
        self.numWordsLang = [0] * len(
            self.languages
        )  # number of words in each language in current document
        self.numSentsLang = [0] * len(
            self.languages
        )  # number of sentences in each language in current document
        self.totalNumWords = 0

    def delete_indices(self):
        if self.es_ic.exists(index=self.name + '.docs'):
            self.es_ic.delete(index=self.name + '.docs')
        if self.es_ic.exists(index=self.name + '.words'):
            self.es_ic.delete(index=self.name + '.words')
        if self.es_ic.exists(index=self.name + '.word_freqs'):
            self.es_ic.delete(index=self.name + '.word_freqs')
        if self.es_ic.exists(index=self.name + '.sentences'):
            self.es_ic.delete(index=self.name + '.sentences')

    def create_indices(self):
        self.wordMapping = self.pd.generate_words_mapping()
        self.wordFreqMapping = self.pd.generate_wordfreq_mapping()
        self.sentMapping = self.pd.generate_sentences_mapping(self.wordMapping)
        self.docMapping = self.pd.generate_docs_mapping()
        self.es_ic.create(index=self.name + '.docs', body=self.docMapping)
        self.es_ic.create(index=self.name + '.words', body=self.wordMapping)
        self.es_ic.create(index=self.name + '.sentences',
                          body=self.sentMapping)

    def randomize_id(self, realID):
        """
        Return a (relatively) randomized sentence ID. This randomization
        is needed in context-aware word queries where the sentences
        are iterated in the order determined by their IDs.
        """
        if realID < 0:
            return realID
        idStart, idEnd = realID // 1000000, realID % 1000000
        return idStart * 1000000 + self.shuffled_ids[idEnd]

    def enhance_word(self, word):
        """
        Add some calculated fields to the JSON word.
        """
        if 'ana' not in word:
            word['n_ana'] = 0
        else:
            word['n_ana'] = len(word['ana'])
            if word['n_ana'] >= 127:
                word['n_ana'] = 127

    def process_sentence_words(self, words, langID):
        """
        Take words list from a sentence, remove all non-searchable
        fields from them and add them to self.words dictionary.
        Add w_id property to each word of the words list.
        """
        sIDAdded = set(
        )  # word IDs for which the current settence ID has been counted for it
        for w in words:
            if w['wtype'] != 'word':
                continue
            self.numWords += 1
            self.numWordsLang[langID] += 1
            self.totalNumWords += 1
            self.enhance_word(w)
            wClean = {'lang': langID}
            lemma = ''
            for field in w:
                if field in self.goodWordFields or field in self.AdditionalWordFields:
                    wClean[field] = w[field]
                    if field == 'wf':
                        if 'wf_lowercase' not in self.settings or self.settings[
                                'wf_lowercase']:
                            wClean[field] = wClean[field].lower()
                        self.wfs.add(wClean[field])
            if 'ana' in w:
                lemma = self.get_lemma(w)
                self.lemmata.add(lemma)
                wClean['ana'] = []
                for ana in w['ana']:
                    cleanAna = {}
                    for anaField in ana:
                        if anaField in self.goodWordFields or anaField in self.AdditionalWordFields:
                            cleanAna[anaField] = ana[anaField]
                    wClean['ana'].append(cleanAna)
            wCleanTxt = json.dumps(wClean, ensure_ascii=False, sort_keys=True)
            if wCleanTxt in self.tmpWordIDs[langID]:
                wID = self.tmpWordIDs[langID][wCleanTxt]
            else:
                wID = sum(
                    len(self.tmpWordIDs[i])
                    for i in range(len(self.languages)))
                self.tmpWordIDs[langID][wCleanTxt] = wID
            w['w_id'] = wID
            if len(lemma) > 0:
                try:
                    lemmaID = self.tmpLemmaIDs[langID][lemma]
                except KeyError:
                    lemmaID = sum(
                        len(self.tmpLemmaIDs[i])
                        for i in range(len(self.languages))) + 1
                    self.tmpLemmaIDs[langID][lemma] = lemmaID
                self.word2lemma[langID][wID] = lemmaID

            try:
                self.wordFreqs[langID][wID] += 1
            except KeyError:
                self.wordFreqs[langID][wID] = 1
            if wID not in sIDAdded:
                sIDAdded.add(wID)
                try:
                    self.wordSFreqs[langID][wID] += 1
                except KeyError:
                    self.wordSFreqs[langID][wID] = 1
            try:
                self.wordDIDs[langID][wID].add(self.dID)
            except KeyError:
                self.wordDIDs[langID][wID] = {self.dID}
            try:
                self.wordDocFreqs[langID][(wID, self.dID)] += 1
            except KeyError:
                self.wordDocFreqs[langID][(wID, self.dID)] = 1

    def sort_words(self):
        """
        Sort word forms and lemmata stored at earlier stages.
        Return dictionaries with positions of word forms and
        lemmata in the sorted list. Delete the original lists.
        """
        wfsSorted = {}
        iOrder = 0
        for wf in sorted(self.wfs):
            wfsSorted[wf] = iOrder
            iOrder += 1
        self.wfs = None
        lemmataSorted = {}
        iOrder = 0
        for l in sorted(self.lemmata):
            lemmataSorted[l] = iOrder
            iOrder += 1
        self.lemmata = None
        return wfsSorted, lemmataSorted

    def get_freq_ranks(self, freqsSorted):
        """
        Calculate frequency ranks and rank/quantile labels for words
        or lemmata.
        """
        freqToRank = {}
        quantiles = {}
        prevFreq = 0
        prevRank = 0
        for i in range(len(freqsSorted)):
            v = freqsSorted[i]
            if v != prevFreq:
                if prevFreq != 0:
                    freqToRank[prevFreq] = prevRank + (i - prevRank) // 2
                prevRank = i
                prevFreq = v
        if prevFreq != 0:
            freqToRank[prevFreq] = prevRank + (len(freqsSorted) -
                                               prevRank) // 2
        for q in [0.03, 0.04, 0.05, 0.1, 0.15, 0.2, 0.25, 0.5]:
            qIndex = math.ceil(q * len(freqsSorted))
            if qIndex >= len(freqsSorted):
                qIndex = len(freqsSorted) - 1
            if qIndex >= 0:
                quantiles[q] = freqsSorted[qIndex]
            else:
                quantiles[q] = 0
        return freqToRank, quantiles

    def quantile_label(self, freq, rank, quantiles):
        """
        Return a string label of the frequency rank (for frequent items)
        or quantile. This label is showed to the user in word query results.
        """
        if freq > 1 and freq >= quantiles[0.5]:
            if freq > quantiles[0.03]:
                return '#' + str(rank + 1)
            else:
                return '&gt; ' + str(
                    min(
                        math.ceil(q * 100)
                        for q in quantiles if freq >= quantiles[q])) + '%'
        return ''

    def get_lemma(self, word):
        """
        Join all lemmata in the JSON representation of a word with
        an analysis and return them as a string.
        """
        if 'ana' not in word:
            return ''
        if 'keep_lemma_order' not in self.settings or not self.settings[
                'keep_lemma_order']:
            curLemmata = set()
            for ana in word['ana']:
                if 'lex' in ana:
                    if type(ana['lex']) == list:
                        for l in ana['lex']:
                            curLemmata.add(l.lower())
                    else:
                        curLemmata.add(ana['lex'].lower())
            return '/'.join(l for l in sorted(curLemmata))
        curLemmata = []
        for ana in word['ana']:
            if 'lex' in ana:
                if type(ana['lex']) == list:
                    for l in ana['lex']:
                        curLemmata.append(l.lower())
                else:
                    curLemmata.append(ana['lex'].lower())
        return '/'.join(curLemmata)

    def iterate_lemmata(self, langID, lemmaFreqs, lemmaDIDs):
        """
        Iterate over all lemmata for one language collected at the
        word iteration stage.
        """
        lFreqsSorted = [v for v in sorted(lemmaFreqs.values(), reverse=True)]
        freqToRank, quantiles = self.get_freq_ranks(lFreqsSorted)
        iLemma = 0
        for l, lID in self.tmpLemmaIDs[langID].items():
            #if iLemma % 250 == 0:
            #   print('indexing lemma', iLemma)
            lemmaJson = {
                'wf':
                l,
                'freq':
                lemmaFreqs[lID],
                'rank_true':
                freqToRank[lemmaFreqs[lID]],
                'rank':
                self.quantile_label(lemmaFreqs[lID],
                                    freqToRank[lemmaFreqs[lID]], quantiles),
                'n_docs':
                len(lemmaDIDs[lID])
            }
            curAction = {
                '_index': self.name + '.words',
                '_type': 'lemma',
                '_id': lID,
                '_source': lemmaJson
            }
            iLemma += 1
            yield curAction

    def iterate_words(self):
        """
        Iterate through all words collected at the previous
        stage. Return JSON objects with actions for bulk indexing
        in Elasticsearch.
        """
        self.wID = 0
        wfsSorted, lemmataSorted = self.sort_words()

        for langID in range(len(self.languages)):
            iWord = 0
            #print('Processing words in ' + self.languages[langID] + '...')
            lemmaFreqs = {}  # lemma ID -> its frequency
            lemmaDIDs = {}  # lemma ID -> its document IDs
            wFreqsSorted = [
                v
                for v in sorted(self.wordFreqs[langID].values(), reverse=True)
            ]
            freqToRank, quantiles = self.get_freq_ranks(wFreqsSorted)
            # for wID in self.wordFreqs[langID]:
            for w, wID in self.tmpWordIDs[langID].items():
                #if iWord % 500 == 0:
                # print('indexing word', iWord)
                try:
                    lID = self.word2lemma[langID][wID]
                except KeyError:
                    lID = 0
                wJson = json.loads(w)
                wfOrder = len(wfsSorted) + 1
                if 'wf' in wJson:
                    wfOrder = wfsSorted[wJson['wf']]
                lOrder = len(lemmataSorted) + 1
                if 'ana' in wJson:
                    lOrder = lemmataSorted[self.get_lemma(wJson)]
                wJson['wf_order'] = wfOrder
                wJson['l_order'] = lOrder
                wordFreq = self.wordFreqs[langID][wID]
                wJson['freq'] = wordFreq
                try:
                    lemmaFreqs[lID] += wordFreq
                except KeyError:
                    lemmaFreqs[lID] = wordFreq
                if lID != 0:
                    try:
                        lemmaDIDs[lID] |= self.wordDIDs[langID][wID]
                    except KeyError:
                        lemmaDIDs[lID] = set(self.wordDIDs[langID][wID])
                # wJson['sids'] = [sid for sid in sorted(self.wordSIDs[langID][wID])]
                wJson['dids'] = [
                    did for did in sorted(self.wordDIDs[langID][wID])
                ]
                wJson['n_sents'] = self.wordSFreqs[langID][wID]
                wJson['n_docs'] = len(wJson['dids'])
                wJson['rank_true'] = freqToRank[
                    wJson['freq']]  # for the calculations
                wJson['rank'] = self.quantile_label(wJson['freq'],
                                                    wJson['rank_true'],
                                                    quantiles)  # for the user
                curAction = {
                    '_index': self.name + '.words',
                    '_type': 'word',
                    '_id': wID,
                    '_source': wJson,
                    '_parent': lID
                }
                yield curAction

                for docID in wJson['dids']:
                    wfreqJson = {
                        'w_id': wID,
                        'd_id': docID,
                        'wf_order': wfOrder,
                        'l_order': lOrder,
                        'freq': self.wordDocFreqs[langID][(wID, docID)]
                    }
                    curAction = {
                        '_index': self.name + '.words',
                        '_type': 'word_freq',
                        '_id': self.wordFreqID,
                        '_source': wfreqJson,
                        '_parent': wID,
                        '_routing': lID
                    }
                    self.wordFreqID += 1
                    yield curAction
                iWord += 1
                self.wID += 1
            for lAction in self.iterate_lemmata(langID, lemmaFreqs, lemmaDIDs):
                yield lAction
        emptyLemmaJson = {'wf': 0, 'freq': 0, 'rank_true': -1}
        curAction = {
            '_index': self.name + '.words',
            '_type': 'lemma',
            '_id': 0,
            '_source': emptyLemmaJson
        }
        yield curAction

    def index_words(self):
        """
        Index all words that have been collected at the previous stage
        in self.words (while the sentences were being indexed).
        """
        bulk(self.es, self.iterate_words(), chunk_size=300, request_timeout=60)

    def add_parallel_sids(self, sentences, paraIDs):
        """
        In the parallel corpus, add the IDs of aligned sentences in other languages
        to each sentence that has a para_id.
        """
        for s in sentences:
            if 'para_alignment' not in s['_source'] or 'lang' not in s[
                    '_source']:
                continue
            langID = s['_source']['lang']
            for pa in s['_source']['para_alignment']:
                paraID = pa['para_id']
                pa['sent_ids'] = []
                for i in range(len(self.languages)):
                    if i == langID:
                        continue
                    if paraID in paraIDs[i]:
                        pa['sent_ids'] += paraIDs[i][paraID]

    def iterate_sentences(self, fname):
        self.numSents = 0
        prevLast = False
        sentences = []
        paraIDs = [{} for i in range(len(self.languages))]
        for s, bLast in self.iterSent.get_sentences(fname):
            if 'lang' in s:
                langID = s['lang']
            else:
                langID = 0
                s['lang'] = langID
            s['n_words'] = 0
            if 'words' in s:
                self.process_sentence_words(s['words'], langID)
                s['n_words'] = sum(1 for w in s['words']
                                   if 'wtype' in w and w['wtype'] == 'word')
            if prevLast:
                prevLast = False
            elif self.numSents > 0:
                s['prev_id'] = self.randomize_id(self.sID - 1)
            if not bLast and 'last' not in s:
                s['next_id'] = self.randomize_id(self.sID + 1)
            else:
                prevLast = True
            s['doc_id'] = self.dID
            if 'meta' in s:
                for metaField in [
                        mf for mf in s['meta'].keys()
                        if not mf.startswith('year')
                ]:
                    s['meta'][metaField + '_kw'] = s['meta'][metaField]
            # self.es.index(index=self.name + '.sentences',
            #               doc_type='sentence',
            #               id=self.sID,
            #               body=s)
            curAction = {
                '_index': self.name + '.sentences',
                '_type': 'sentence',
                '_id': self.randomize_id(self.sID),
                '_source': s
            }
            if len(self.languages) <= 1:
                yield curAction
            else:
                sentences.append(curAction)
                if 'para_alignment' in s:
                    s['para_ids'] = []
                    for pa in s['para_alignment']:
                        paraID = str(self.dID) + '_' + str(pa['para_id'])
                        pa['para_id'] = paraID
                        s['para_ids'].append(paraID)
                        try:
                            paraIDs[langID][paraID].append(
                                self.randomize_id(self.sID))
                        except KeyError:
                            paraIDs[langID][paraID] = [
                                self.randomize_id(self.sID)
                            ]
            #if self.sID % 500 == 0:
            #   print('Indexing sentence', self.sID, ',', self.totalNumWords, 'words so far.')
            self.numSents += 1
            self.numSentsLang[langID] += 1
            self.sID += 1
        if len(self.languages) > 1:
            self.add_parallel_sids(sentences, paraIDs)
            for s in sentences:
                yield s

    @staticmethod
    def add_meta_keywords(meta):
        """
        For each text field in the metadata, add a keyword version
        of the same field.
        """
        for field in [k for k in meta.keys() if not k.startswith('year')]:
            meta[field + '_kw'] = meta[field]

    def index_doc(self, fname):
        """
        Store the metadata of the source file.
        """
        #if self.dID % 100 == 0:
        #print('Indexing document', self.dID)
        meta = self.iterSent.get_metadata(fname)
        self.add_meta_keywords(meta)
        meta['n_words'] = self.numWords
        meta['n_sents'] = self.numSents
        if len(self.settings['languages']) > 1:
            for i in range(len(self.languages)):
                meta['n_words_' + self.languages[i]] = self.numWordsLang[i]
                meta['n_sents_' + self.languages[i]] = self.numSentsLang[i]
        self.numWords = 0
        self.numSents = 0
        self.numWordsLang = [0] * len(self.languages)
        self.numSentsLang = [0] * len(self.languages)
        try:
            self.es.index(index=self.name + '.docs',
                          doc_type='doc',
                          id=self.dID,
                          body=meta)
        except RequestError as err:
            #print('Metadata error: {0}'.format(err))
            shortMeta = {}
            if 'filename' in meta:
                shortMeta['filename'] = meta['filename']
            if 'title' in meta:
                shortMeta['title'] = meta['title']
                shortMeta['title_kw'] = meta['title']
                self.es.index(index=self.name + '.docs',
                              doc_type='doc',
                              id=self.dID,
                              body=shortMeta)
        self.dID += 1

    def index_dir(self):
        """
        Index all files from the corpus directory, sorted by their size
        in decreasing order. Such sorting helps prevent memory errors
        when indexing large corpora, as the default behavior is to load
        the whole file is into memory, and there is more free memory
        in the beginning of the process. If MemoryError occurs, the
        iterative JSON parser is used, which works much slower.
        """
        filenames = []
        for root, dirs, files in os.walk(self.corpus_dir):
            for fname in files:
                if (not ((self.settings['input_format'] == 'json'
                          and fname.lower().endswith('.json')) or
                         (self.settings['input_format'] == 'json-gzip'
                          and fname.lower().endswith('.json.gz')))):
                    continue
                fnameFull = os.path.join(root, fname)
                filenames.append((fnameFull, os.path.getsize(fnameFull)))
        if len(filenames) <= 0:
            print('There are no files in this corpus.')
            return
        for fname, fsize in sorted(filenames, key=lambda p: -p[1]):
            # print(fname, fsize)
            bulk(self.es,
                 self.iterate_sentences(fname),
                 chunk_size=200,
                 request_timeout=60)
            self.index_doc(fname)
        self.index_words()

    def compile_translations(self):
        """
        Compile flask_babel translations in ../search/web_app.
        """
        pythonPath = ''
        for p in sys.path:
            if re.search('Python3[^/\\\\]*[/\\\\]?$', p) is not None:
                pythonPath = p
                break
        if len(pythonPath) <= 0:
            pyBabelPath = 'pybabel'
        else:
            pyBabelPath = os.path.join(pythonPath, 'Scripts', 'pybabel')
        try:
            subprocess.run([pyBabelPath, 'compile', '-d', 'translations'],
                           cwd='../search/web_app',
                           check=True)
        except:
            print('Could not compile translations with ' + pyBabelPath + ' .')
        else:
            print('Interface translations compiled.')

    def load_corpus(self):
        """
        Drop the current database, if any, and load the entire corpus.
        """
        t1 = time.time()
        self.compile_translations()
        self.delete_indices()
        self.create_indices()
        self.index_dir()
        t2 = time.time()
        print('Corpus indexed in', t2 - t1, 'seconds:', self.dID, 'documents,',
              self.sID, 'sentences,', self.totalNumWords, 'words,',
              sum(len(self.wordFreqs[i]) for i in range(len(self.languages))),
              'word types (different words).')
class CorpusTransformer:
    """
    Contains methods for transforming tsakorpus JSON files
    into plain text files.
    """
    rxMultipleSpaces = re.compile('  +')

    def __init__(self,
                 input_format='json',
                 lang='kpv',
                 langCode=0,
                 minAnalyzed=0.66,
                 alphabet='[а-яёӧі -]'):
        """
        Only add sentences where the "lang" attribute equals langCode.
        OExclude sentences where the share of analyzed words is less
        than minAnalyzed.
        """
        self.lang = lang
        self.input_format = input_format
        self.langCode = langCode
        self.minAnalyzed = minAnalyzed
        self.lp = LanguageProcessor(self.lang)
        self.iterSent = None
        self.rxAlphabet = re.compile('^' + alphabet + '+$')
        if self.input_format not in ['json', 'json-gzip']:
            print('Format should equal either "json" or "json-gzip".')
        else:
            self.iterSent = JSONDocReader(format=self.input_format)

    def process_sentences(self, fname):
        """
        Iterate over sentences in a JSON file. For each suitable
        sentence, return its normalized text representation.
        """
        for s, bLast in self.iterSent.get_sentences(fname):
            if 'lang' in s:
                langID = s['lang']
            else:
                langID = 0
            if langID != self.langCode:
                continue
            if 'words' not in s or len(s['words']) <= 0:
                continue
            nWords = sum(1 for token in s['words'] if token['wtype'] == 'word')
            if nWords <= 0:
                continue
            nAnalyzed = sum(1 for token in s['words']
                            if token['wtype'] == 'word' and 'ana' in token
                            and len(token['ana']) > 0)
            if nAnalyzed / nWords < self.minAnalyzed:
                continue
            sentOut = ''
            for w in s['words']:
                if w['wtype'] != 'word':
                    continue
                if len(w['wf']) == 1 and ('ana' not in w
                                          or len(w['ana']) <= 0):
                    # we need as few one-letter words as possible
                    continue
                sentOut += self.lp.process_word(w['wf']) + ' '
            sentOut = CorpusTransformer.rxMultipleSpaces.sub(
                ' ', sentOut.strip())
            if self.rxAlphabet.search(sentOut) is not None:
                yield sentOut
            # else:
            #     print(sentOut)

    def extract_data(self, dir_in, dir_out):
        """
        Read all .json or .json.gz files in the dir_in folder and
        write their sentences to dir_out.
        """
        if dir_out == dir_in:
            return
        filenames = []
        for root, dirs, files in os.walk(dir_in):
            for fname in files:
                if (not ((self.input_format == 'json'
                          and fname.lower().endswith('.json')) or
                         (self.input_format == 'json-gzip'
                          and fname.lower().endswith('.json.gz')))):
                    continue
                fnameFull = os.path.join(root, fname)
                filenames.append((fnameFull, os.path.getsize(fnameFull)))
        if len(filenames) <= 0:
            print('There are no files in this corpus.')
            return
        sentences = []
        nWords = 0
        for fname, fsize in sorted(filenames, key=lambda p: -p[1]):
            print(fname, fsize)
            for s in self.process_sentences(fname):
                if len(s) > 2:
                    sentences.append(s)
                    nWords += s.count(' ') + 1
        fOut = open(os.path.join(dir_out, self.lang) + '.txt',
                    'w',
                    encoding='utf-8',
                    newline='\n')
        for s in sorted(sentences):
            fOut.write(s + '\n')
        fOut.close()
        print('Conversion complete, ' + str(len(sentences)) +
              ' sentences written, ' + str(nWords) + ' tokens total.')
Exemple #8
0
class JSON2HTML:
    """
    Contains methods for translating annotated JSON files into
    HTML files, provided that the corpus settings allow full-text view.
    """
    SETTINGS_DIR = '../conf'

    def __init__(self, settings):
        self.settings = CorpusSettings()
        self.settings.load_settings(
            os.path.join(self.SETTINGS_DIR, 'corpus.json'),
            os.path.join(self.SETTINGS_DIR, 'categories.json'))
        self.sentView = SentenceViewer(self.settings, None, fullText=True)
        self.iterSent = None
        if self.settings.input_format in ['json', 'json-gzip']:
            self.iterSent = JSONDocReader(format=self.settings.input_format,
                                          settings=settings)
        self.lastSentNum = 0  # for the IDs in the HTML

    def finalize_html_sentence(self, sent):
        """
        Add span tags etc. to a sentence in HTML and clean it.
        """
        # sent = sent.replace('<span class="newline"></span>', '<br>')
        sent = re.sub('^[\n ]*<br> *', '', sent, flags=re.DOTALL)
        sent = re.sub('\n\n+', '\n', sent, flags=re.DOTALL)
        sent = re.sub('  +', ' ', sent)
        return sent

    def finalize_html_paragraph(self, sentByTier, colClass, paraNum):
        """
        Make one HTML paragraph with parallel sentences.
        """
        remainingCol = max(2, 12 - colClass * len(sentByTier))
        paragraph = '<div class="d-none d-sm-block col-md-' + str(
            remainingCol // 2) + '"></div>'
        paragraph += '<div class="paragraph_num">'
        if paraNum % 10 == 0:
            paragraph += '<div>' + str(paraNum) + '</div>'
        paragraph += '</div>\n'
        for iTier in range(len(sentByTier)):
            sent = sentByTier[iTier]
            sent = re.sub('(?<=class="word)(.*)',
                          lambda m: m.group(1).replace(
                              '<span class="newline"></span>', '<br>'),
                          sent,
                          flags=re.DOTALL)
            sent = '<div class="col-sm-' + str(colClass) \
                   + '"><span class="sent_lang sent_lang_lang' + str(iTier) \
                   + '" id="res1_lang' + str(iTier) + '">' \
                   + sent + '</span></div>\n'
            paragraph += sent
        return paragraph

    def process_file(self, fnameIn, fnameOut):
        """
        Read one JSON file (fnameIn). Generate an HTML representation for it
        and store it in fnameOut.
        """
        htmlByTier = [[]]
        nTier = 0
        paraIDsByTier = [set()]
        for s, bLast in self.iterSent.get_sentences(fnameIn):
            if 'lang' in s:
                langID = s['lang']
            else:
                langID = 0
                s['lang'] = langID
            curParaIDs = []
            if 'para_alignment' in s:
                for para in s['para_alignment']:
                    if 'para_id' in para:
                        curParaIDs.append(para['para_id'])
            s['doc_id'] = '0'
            s = {'_source': s}
            self.lastSentNum += 1
            lang = self.settings.languages[langID]
            sentProcessed = self.sentView.process_sentence(
                s,
                numSent=self.lastSentNum,
                lang=lang,
                langView='lang' + str(nTier))
            if len(sentProcessed['languages']['lang' +
                                              str(nTier)]['text']) > 0:
                curSentData = {
                    'html':
                    sentProcessed['languages']['lang' + str(nTier)]['text'] +
                    ' \n',
                    'para_ids':
                    curParaIDs
                }
                htmlByTier[nTier].append(curSentData)
                paraIDsByTier[nTier] |= set(curSentData['para_ids'])
            if bLast or ('last' in s['_source'] and s['_source']['last']):
                nTier += 1
                htmlByTier.append([])
                paraIDsByTier.append(set())

        # Remove empty tiers
        for iTier in range(len(htmlByTier) - 1, -1, -1):
            if (len(htmlByTier[iTier]) <= 0 or all(
                    len(sent['html'].strip()) <= 0
                    for sent in htmlByTier[iTier])):
                del htmlByTier[iTier]
                del paraIDsByTier[iTier]
        nTiers = len(htmlByTier)

        colClass = 8
        if nTiers > 1:
            colClass = max(2, 10 // nTiers)

        curPointers = [0] * nTiers
        usedParaIDsByTier = [set() for _ in range(nTiers)]
        dataFinal = {'rows': [], 'meta': self.iterSent.get_metadata(fnameIn)}

        fname = ''
        if 'fulltext_id' in dataFinal['meta']:
            fname = secure_filename(dataFinal['meta']['fulltext_id'])
        if len(fname) <= 0:
            return

        while curPointers[0] < len(htmlByTier[0]):
            curParagraph = [''] * nTiers
            curParagraph[0] = self.finalize_html_sentence(
                htmlByTier[0][curPointers[0]]['html'])
            curParaIDs = set(htmlByTier[0][curPointers[0]]['para_ids'])
            for iTier in range(1, nTiers):
                remainingParaIDs = (paraIDsByTier[iTier]
                                    & curParaIDs) - usedParaIDsByTier[iTier]
                while len(remainingParaIDs) > 0 and curPointers[iTier] < len(
                        htmlByTier[iTier]):
                    curParagraph[iTier] += self.finalize_html_sentence(
                        htmlByTier[iTier][curPointers[iTier]]['html'])
                    usedParaIDsByTier[iTier] |= set(
                        htmlByTier[iTier][curPointers[iTier]]['para_ids'])
                    remainingParaIDs -= set(
                        htmlByTier[iTier][curPointers[iTier]]['para_ids'])
                    curPointers[iTier] += 1
            dataFinal['rows'].append(
                self.finalize_html_paragraph(curParagraph, colClass,
                                             curPointers[0] + 1))
            curPointers[0] += 1

        if not os.path.exists(os.path.dirname(fnameOut)):
            os.makedirs(os.path.dirname(fnameOut))
        with open(fnameOut, 'w', encoding='utf-8') as fOut:
            json.dump(dataFinal, fOut, indent=1, ensure_ascii=False)