Beispiel #1
0
    def generate_pivot(self, sdoc, tdoc):
        """Build a pivot document joining the target sides of two documents.

        Both documents must share the same ``source_id`` (the pivot). The
        result takes its source id/language/text from the target fields of
        ``sdoc`` and its target id/language/text from the target fields of
        ``tdoc``.

        Args:
            sdoc: Mapping providing ``source_id`` and the ``target_id``,
                ``target_language`` and ``target_text`` keys.
            tdoc: Mapping providing the same keys as ``sdoc``.

        Returns:
            A new dict holding the paired fields, one entry per
            ``TMDbQuery.str_attrs`` name found in ``sdoc``, and date fields.

        Raises:
            AssertionError: If the ``source_id`` values differ (the mismatch
                is logged first).
        """
        if sdoc['source_id'] != tdoc['source_id']:
            message = "Invalid pair for pivot generation: sdoc {}, tdoc {}"
            logging.error(message.format(sdoc, tdoc))
        # make sure pivot exists
        assert sdoc['source_id'] == tdoc['source_id']

        pivot = {}
        for field in ('id', 'language', 'text'):
            target_key = 'target_' + field
            pivot['source_' + field] = sdoc[target_key]
            pivot[target_key] = tdoc[target_key]

        for name in TMDbQuery.str_attrs:
            if name in sdoc:
                # TODO: should it be union or intersection?
                if sdoc.get(name) and tdoc.get(name):
                    pivot[name] = sdoc[name] + tdoc[name]
                else:
                    pivot[name] = None

        # Every creation/modification date is stamped with the current time
        for date_field in ('tm_creation_date', 'tm_change_date',
                           'insert_date', 'update_date'):
            pivot[date_field] = TMUtils.date2str(datetime.datetime.now())
        # The check date is initialized to 1970-01-01
        pivot['check_date'] = TMUtils.date2str(datetime.datetime(1970, 1, 1))
        return pivot
Beispiel #2
0
  def output_segment(self, segment):
    """Convert a segment into a ``tu`` XML element.

    The element carries language and date attributes, optional ``tuid`` and
    ``creationid`` attributes, ``prop`` children for industry, type,
    organization and any non-``tda-`` metadata, and one ``tuv`` child for each
    of the source and target sides.

    Args:
      segment: Object exposing the segment attributes read below.

    Returns:
      The populated ``tu`` element.
    """
    tu = etree.Element('tu')
    tu.set('srclang', TMUtils.list2str(segment.source_language))
    # Fall back to the current time when the segment has no date of its own
    created = segment.tm_creation_date or TMUtils.date2str(datetime.datetime.now())
    tu.set('creationdate', created)
    changed = segment.tm_change_date or TMUtils.date2str(datetime.datetime.now())
    tu.set('changedate', changed)
    if segment.tuid:
      tu.set('tuid', str(segment.tuid))
    if segment.username:
      tu.set('creationid', segment.username)

    for attr_name, prop_name in (('industry', "tda-industry"),
                                 ('type', "tda-type"),
                                 ('organization', "tda-org")):
      value = getattr(segment, attr_name)
      if not value:
        continue
      prop_value = self.list2str(value)
      etree.SubElement(tu, 'prop', {'type' : prop_name}).text = prop_value
      if attr_name == 'organization':
        # The product property is only emitted together with the organization
        etree.SubElement(tu, 'prop', {'type' : "tda-prod"}).text = "Default"

    if segment.metadata:
      for prop_type, prop_text in segment.metadata.items():
        if prop_type.startswith('tda-'):
          continue  # skip already handled props
        etree.SubElement(tu, 'prop', {'type': prop_type}).text = prop_text

    for side in ('source', 'target'):
      lang = TMUtils.list2str(getattr(segment, side + '_language'))
      tuv = etree.SubElement(tu, 'tuv', {'{http://www.w3.org/XML/1998/namespace}lang' : lang})
      pos = getattr(segment, side + '_pos')
      if pos:
        etree.SubElement(tuv, 'prop', {'type': "pos"}).text = pos
      side_metadata = getattr(segment, side + '_metadata')
      if side_metadata:
        for prop_type, prop_text in side_metadata.items():
          etree.SubElement(tuv, 'prop', {'type': prop_type}).text = prop_text

      etree.SubElement(tuv, 'seg').text = getattr(segment, side + '_text')

    return tu
Beispiel #3
0
 def get(self, source_id, source_lang, target_lang):
     """Look up the target id mapped to ``source_id`` for a language pair.

     Args:
         source_id: Value matched against the ``source_id`` field.
         source_lang: Source language used to derive the map collection name.
         target_lang: Target language used to derive the map collection name.

     Returns:
         The ``target_id`` of the first matching document, or ``None`` when
         no document matches.
     """
     src_index = TMUtils.lang2es_index(source_lang)
     tgt_index = TMUtils.lang2es_index(target_lang)
     m_index = TMUtils.es_index2mapdb(src_index, tgt_index)
     # find_one fetches the first match (or None) in a single round trip and
     # avoids Cursor.count(), which newer PyMongo releases no longer provide
     m_doc = self.mongo_db[m_index].find_one({'source_id': source_id})
     if not m_doc:
         return None
     return m_doc['target_id']
Beispiel #4
0
 def add_segment(self, segment):
     """Upsert the mapping document of one segment into MongoDB.

     Args:
         segment: Object exposing ``source_lang``, ``target_lang`` and
             ``source_id``; it is converted with ``self._segment2doc``.

     Returns:
         The result returned by ``update_one``.
     """
     src_index = TMUtils.lang2es_index(segment.source_lang)
     tgt_index = TMUtils.lang2es_index(segment.target_lang)
     collection = self.mongo_db[TMUtils.es_index2mapdb(src_index, tgt_index)]
     # TODO: do not update if creation date is older than existing one
     selector = {'source_id': segment.source_id}
     changes = {'$set': self._segment2doc(segment)}
     # upsert=True inserts the document when no match exists
     return collection.update_one(selector, changes, upsert=True)
Beispiel #5
0
  def _segment2doc(self, segment, ftype):
    """Build the search-index document for one side of a segment.

    Args:
      segment: Object exposing ``<side>_text`` and ``<side>_language`` for
        both sides, and optionally ``<side>_pos``.
      ftype: Side to convert, ``'source'`` or ``'target'``.

    Returns:
      A dict with ``text``, ``pos`` (only when the segment has the
      attribute), ``target_language`` and ``token_cnt``.
    """
    doc = {'text': getattr(segment, ftype + '_text')}
    # Optional fields (POS, tokenized). The attribute is probed with hasattr
    # only: an unconditional getattr here would raise AttributeError and make
    # the field mandatory.
    if hasattr(segment, ftype + '_pos'):
      doc['pos'] = getattr(segment, ftype + '_pos')

    op_ftype = 'source' if ftype == 'target' else 'target'
    # Auxiliary field to facilitate language matrix generation
    doc['target_language'] = [TMUtils.lang2short(TMUtils.str2list(getattr(segment, op_ftype + '_language'))[0])]
    doc['token_cnt'] = self.token_count(getattr(segment, ftype + '_text'), getattr(segment, ftype + '_language'))
    return doc
Beispiel #6
0
    def add_segments(self, segments):
        """Store the mapping documents of a batch of segments.

        The database name is derived from the language pair of the first
        segment; the database is created when it cannot be opened.

        Args:
            segments: Sequence of segments; may be empty or ``None``.

        Returns:
            The result of the database ``update`` call, or ``None`` when
            ``segments`` is empty.
        """
        if not segments:
            return
        m_index = TMUtils.es_index2mapdb(
            TMUtils.lang2es_index(segments[0].source_lang),
            TMUtils.lang2es_index(segments[0].target_lang))
        try:
            db = self.server[m_index]
        except Exception:
            # Opening failed (presumably the database does not exist yet), so
            # create it. Unlike a bare except, this does not intercept
            # KeyboardInterrupt or SystemExit.
            db = self.server.create(m_index)

        return db.update([self._segment2doc(s) for s in segments])
Beispiel #7
0
 def get(self, source_id, source_lang, target_lang):
     """Look up the target id mapped to ``source_id`` in the SQL map table.

     Args:
         source_id: Value compared against the ``source_id`` column.
         source_lang: Source language used to derive the table name.
         target_lang: Target language used to derive the table name.

     Returns:
         The first column of the first matching row, or ``None`` when the
         query yields no row.

     Raises:
         Exception: If no table is registered for the language pair.
     """
     tname = TMUtils.es_index2mapdb(TMUtils.lang2es_index(source_lang),
                                    TMUtils.lang2es_index(target_lang))
     if tname not in self.tables:
         raise Exception("Language pair : {} - {} doesn't exist".format(
             source_lang, target_lang))
     # TODO: implement bidirectional query
     t = self.tables[tname]
     res = self.conn.execute(
         t.select(t.target_id).where(t.source_id == source_id))
     if not res:
         return None
     # fetchone() returns None when there is no row; indexing it directly
     # would raise TypeError instead of reporting "not found"
     row = res.fetchone()
     if row is None:
         return None
     return row[0]
Beispiel #8
0
  def get(self, lang, id):
    """Fetch the stored source of one document by id.

    Args:
      lang: Language used to derive the index name.
      id: Identifier of the document to fetch.

    Returns:
      The ``_source`` of the fetched document, or ``None`` when the index
      does not exist or the fetch result is empty.
    """
    es_index = TMUtils.lang2es_index(lang)
    if self.index_exists(es_index):
      found = self.es.get(index=es_index, id=id)
      if found:
        return found['_source']
    return None
Beispiel #9
0
  def _fill_lang(self, tuv, seg):
    """Assemble the language, text and metadata fields of a source/target pair.

    Args:
      tuv: Pair ``(source, target)`` of nodes passed to ``self._get_lang``
        and ``self._parse_metadata``.
      seg: Pair ``(source, target)`` of nodes passed to ``self._get_text``.

    Returns:
      A dict with ``source_``/``target_`` prefixed ``language``, ``text`` and
      ``metadata`` entries.
    """
    def side_text(node):
      # Byte strings are round-tripped through UTF-8 (decode, then encode)
      text = self._get_text(node)
      if isinstance(text, bytes):
        text = text.decode('utf8').encode('utf8')
      return text

    return {
      'source_language': TMUtils.lang2short(self._get_lang(tuv[0])),
      'target_language': TMUtils.lang2short(self._get_lang(tuv[1])),
      'source_text': side_text(seg[0]),
      'target_text': side_text(seg[1]),
      'source_metadata': self._parse_metadata(tuv[0]),
      'target_metadata': self._parse_metadata(tuv[1]),
    }
Beispiel #10
0
    def __call__(self, index, segments_iter):
        """POS-tag the source and target texts of a batch of segments.

        All segments are cached in memory, tagged in two batches (source and
        target), and updated in place with ``source_pos`` / ``target_pos``.
        For segments whose source text matches ``XmlUtils.TAG_PATTERN`` the
        tags are also written to two text files in the temp directory.

        Args:
            index: Batch identifier; used in the names of the output files.
            segments_iter: Iterable of segments exposing ``source_text`` and
                ``target_text``.

        Returns:
            The list of segments, with POS attributes filled in.
        """
        # Import should be inside the function to avoid serializing all pos tagger dependencies
        # for parallel execution
        root_dir = os.path.join(
            os.path.abspath(os.path.dirname(__file__)), '..', '..')
        if root_dir not in sys.path:  # do not grow sys.path on every call
            sys.path.append(root_dir)
        sys.path = [p for p in sys.path if p]
        from TMPosTagger.TMPosTagger import TMPosTagger

        # Cache all segments. Though it might be expensive in terms of memory, but we need
        # to gather all texts for POS tagger batch and then store back
        # batch of POS-tagged results. Batch should be small enough by splitting to sufficiently
        # large number of Spark jobs
        segments = list(segments_iter)
        # Initialize PosTaggers for source and target languages
        pos_taggers = [
            TMPosTagger(lang.split('-')[0], universal=self.is_universal)
            for lang in self.langs
        ]
        # Invoke POS taggers for source and target segments
        src_texts = pos_taggers[0].tag_segments(
            [XmlUtils.replace_tags(s.source_text) for s in segments])
        tgt_texts = pos_taggers[1].tag_segments(
            [XmlUtils.replace_tags(s.target_text) for s in segments])
        # Store POS tags with XML tags as a training data. TODO: make it optional
        pos_path = tempfile.gettempdir() + "/pos_tags-{}-{}.txt".format(
            TMUtils.date2str(datetime.datetime.now()), index)
        iob_path = tempfile.gettempdir() + "/iob_tags-{}-{}.txt".format(
            TMUtils.date2str(datetime.datetime.now()), index)
        # Context managers guarantee both files are closed even if tagging or
        # formatting raises part-way through the batch
        with open(pos_path, 'w') as pos_file, open(iob_path, 'w') as iob_file:
            for segment, stext, ttext in zip(segments, src_texts, tgt_texts):
                segment.source_pos = self.tags2string(stext)
                segment.target_pos = self.tags2string(ttext)
                # Write POS tags (+XML tags) to text file to be used as a training data
                if re.match(XmlUtils.TAG_PATTERN, segment.source_text):
                    pos_file.write("{}\n{}\n\n".format(
                        self.tags2string_xml_tags(segment.source_text, stext),
                        self.tags2string_xml_tags(segment.target_text, ttext)))
                    # Distinct names so the current segment is not rebound
                    for src_iob, tgt_iob in zip(
                            self.tags2string_iob_tags(segment.source_text, stext),
                            self.tags2string_iob_tags(segment.target_text, ttext)):
                        iob_file.write("{}\n{}\n\n".format(src_iob, tgt_iob))

        return segments
Beispiel #11
0
  def scan(self, lang, filter = None):
    """Iterate over every hit of a scan query on the index of ``lang``.

    Args:
      lang: Language used to derive the index name.
      filter: Optional filter forwarded to ``TMDbQuery``.

    Yields:
      Each hit produced by the query scan; nothing when the index does not
      exist.
    """
    es_index = TMUtils.lang2es_index(lang)
    if self.index_exists(es_index):
      scanner = TMDbQuery(es=self.es, index = es_index, filter=filter)
      # Build segment by querying map and target index
      for found in scanner.scan():
        yield found
Beispiel #12
0
    def add_segments(self, segments):
        """Update the mapping documents of a batch of segments in one bulk call.

        The collection is derived from the language pair of the first
        segment; all updates of the batch go to that collection.

        Args:
            segments: Iterable of segments exposing ``source_lang``,
                ``target_lang`` and ``source_id``; may be empty.

        Returns:
            The bulk execution result, the error details dict when a
            ``BulkWriteError`` occurs, or ``None`` when there is nothing to
            write.
        """
        bulk = None
        for segment in segments:
            if bulk is None:
                m_index = TMUtils.es_index2mapdb(
                    TMUtils.lang2es_index(segment.source_lang),
                    TMUtils.lang2es_index(segment.target_lang))
                bulk = self.mongo_db[m_index].initialize_unordered_bulk_op()
            # NOTE(review): unlike a single-document upsert, this only updates
            # existing documents - confirm whether .upsert() is intended here
            bulk.find({'source_id': segment.source_id}) \
              .update_one({'$set': self._segment2doc(segment) })

        # Empty input: no bulk operation was ever created, nothing to execute
        if bulk is None:
            return None

        try:
            result = bulk.execute()
        except BulkWriteError as bwe:
            result = bwe.details
            logging.error(bwe.details)
        return result
Beispiel #13
0
    def _segment2doc(self, segment):
        """Convert a segment into a plain dict document.

        Side effect: the segment's date fields are updated first -
        ``insert_date`` and ``check_date`` are filled only when empty,
        ``update_date`` is always set to the current time.

        Args:
            segment: Object exposing the attributes listed below.

        Returns:
            A dict with one entry per listed attribute; the categorical
            attributes are passed through ``TMUtils.str2list``.
        """
        # Initialize/update DB date fields
        stamp = TMUtils.date2str(datetime.datetime.now())
        if not segment.insert_date:
            segment.insert_date = stamp
        if not segment.check_date:
            epoch = datetime.datetime(1970, 1, 1)
            segment.check_date = TMUtils.date2str(epoch)
        segment.update_date = stamp

        doc = {}
        # Fields copied verbatim
        for name in ('source_id', 'target_id', 'source_text', 'target_text',
                     'source_language', 'target_language', 'source_metadata',
                     'target_metadata', 'metadata', 'tuid'):
            doc[name] = getattr(segment, name)
        # Fields converted with str2list
        for name in ('industry', 'type', 'organization', 'file_name',
                     'domain'):
            doc[name] = TMUtils.str2list(getattr(segment, name))
        # Dates and bookkeeping fields, copied verbatim
        for name in ('tm_creation_date', 'tm_change_date', 'insert_date',
                     'update_date', 'check_date', 'check_version',
                     'dirty_score', 'username'):
            doc[name] = getattr(segment, name)
        return doc
Beispiel #14
0
  def scan_pivot(self, pivot_lang, langs):
    """Iterate over ids of pivot-language documents matching all ``langs``.

    One ``match`` query on ``target_language`` is chained per entry of
    ``langs``.

    Args:
      pivot_lang: Language used to derive the index to scan.
      langs: Iterable of values matched against ``target_language``.

    Yields:
      The ``meta.id`` of every scanned result; nothing when the index does
      not exist.
    """
    pivot_index = TMUtils.lang2es_index(pivot_lang)
    if self.index_exists(pivot_index):
      pivot_search = Search(using=self.es, index=pivot_index)
      for wanted in langs:
        pivot_search = pivot_search.query('match', target_language=wanted)
      for found in pivot_search.scan():
        yield found.meta.id
Beispiel #15
0
 def mget(self, ids_lang):
   """Fetch several documents, possibly from different indexes, in one call.

   Args:
     ids_lang: Iterable of ``(lang, id)`` pairs; may be empty.

   Returns:
     ``[]`` for empty input, ``None`` when the multi-get response is empty,
     otherwise a list with the ``_source`` of each returned doc (``None``
     for docs without one).
   """
   if not ids_lang:
     return []
   requested = []
   for lang, doc_id in ids_lang:
     requested.append({'_index': TMUtils.lang2es_index(lang), '_id' : doc_id})
   response = self.es.mget(body={'docs' : requested})
   if not response:
     return None
   sources = []
   for found in response['docs']:
     sources.append(found.get('_source', None))
   return sources
Beispiel #16
0
 def add_segment(self, segment, ftype):
   """Index one side of a segment.

   Args:
     segment: Object exposing ``<ftype>_id`` and ``<ftype>_language``.
     ftype: Side to index, used as the attribute prefix.

   Returns:
     The id under which the document was indexed.
   """
   # Add segment source and target texts to the correspondent index of ElasticSearch
   doc_id = getattr(segment, ftype + '_id')
   es_index = TMUtils.lang2es_index(getattr(segment, ftype + '_language'))
   self.es.index(index=es_index,
                 doc_type=self.DOC_TYPE,
                 id=doc_id,
                 body=self._segment2doc(segment, ftype))
   return doc_id
Beispiel #17
0
 def _segment2table(self, segment, suffix=None):
     """Return the map table for the segment's language pair, creating it once.

     Tables are cached in ``self.tables`` by name. A missing table is
     defined, registered in the cache and created with ``checkfirst=True``.

     Args:
         segment: Object exposing ``source_lang`` and ``target_lang``.
         suffix: Optional string appended to the derived table name.

     Returns:
         The table object for the (optionally suffixed) name.
     """
     src_index = TMUtils.lang2es_index(segment.source_lang)
     tgt_index = TMUtils.lang2es_index(segment.target_lang)
     tname = TMUtils.es_index2mapdb(src_index, tgt_index)
     if suffix:
         tname += suffix
     if tname in self.tables:
         return self.tables[tname]

     md = MetaData()
     table = Table(tname,
                   md,
                   Column('id', Integer, primary_key=True),
                   Column('source_id', GUID, index=True),
                   Column('target_id', GUID, index=True),
                   Column('creation_date', TIMESTAMP),
                   Column('change_date', TIMESTAMP),
                   mysql_engine='InnoDB',
                   mysql_charset='utf8')
     # Register before creating, then bind the metadata to the connection
     self.tables[tname] = table
     md.bind = self.conn
     table.create(checkfirst=True)
     return table
Beispiel #18
0
  def mquery(self, lang, limit, q_list, filter=None):
    """Run a list of queries against the index of ``lang``.

    Args:
      lang: Language used to derive the index name.
      limit: Limit forwarded to ``TMDbQuery``.
      q_list: Queries forwarded to ``TMDbQuery`` as ``q``.
      filter: Optional filter forwarded to ``TMDbQuery``.

    Yields:
      Each response produced by the query; nothing when the index does not
      exist.
    """
    es_index = TMUtils.lang2es_index(lang)
    if self.index_exists(es_index):
      # Query source ES for the text
      run_query = TMDbQuery(es=self.es, index=es_index, q=q_list,
                            filter=filter, limit=limit)
      for response, _q in run_query():
        yield response
Beispiel #19
0
 def query(self, lang, qstring, filter = None):
   """Run a query against the index of ``lang`` and flatten its hits.

   Args:
     lang: Language used to derive the index name.
     qstring: Query forwarded to ``TMDbQuery`` as ``q``.
     filter: Optional filter forwarded to ``TMDbQuery``.

   Yields:
     ``(hit, q)`` tuples for every hit of every response; nothing when the
     index does not exist.
   """
   es_index = TMUtils.lang2es_index(lang)
   if self.index_exists(es_index):
     # Query source ES for the text
     run_query = TMDbQuery(es=self.es, index = es_index, q=qstring, filter=filter)
     for response, q in run_query():
       for found in response:
         yield found, q
Beispiel #20
0
 def __call__(self, segment):
     """Score a segment by how poorly its texts match the expected languages.

     For each expected language (source first, then target) the detection
     probability is compared with the rule threshold; below it, the rule
     score scaled by ``1 / prob`` is added.

     Args:
         segment: Object exposing ``source_text`` and ``target_text``.

     Returns:
         The accumulated penalty; ``0`` when no side is penalized.
     """
     # NOTE(review): a detection probability of 0 would raise
     # ZeroDivisionError below - confirm detect_lang never returns it
     penalty = 0
     for lang, side in zip(self.langs, ['source', 'target']):
         if self.rule_langs and self.rule_langs != lang:
             continue  # rule is restricted to another language
         text = getattr(segment, side + '_text')
         _detected, prob = TMUtils.detect_lang(text, [lang])
         # If language probability is lower than the threshold,
         # penalize segment with the score proportional to probability
         threshold = self.rule_dict.get('threshold',
                                        LangidRule.DEFAULT_THRESHOLD)
         if prob < threshold:
             base = self.rule_dict.get('score', self.DEFAULT_SCORE)
             penalty += base * (1 / prob)
     return penalty
Beispiel #21
0
 def init_job(self, job_id=None, username=None, type='default', **kwargs):
   """Register a new job in ``pending`` status.

   Args:
     job_id: Job identifier; allocated with ``self._allocate_id()`` when
       not given.
     username: Name of the submitting user.
     type: Job type label.
     **kwargs: Job parameters, stored under ``params``.

   Returns:
     The id of the registered job.
   """
   # Allocate the id first so that the stored doc and the return value agree
   if not job_id: job_id = self._allocate_id()
   doc = {
          'id': job_id,
          'type': type,
          'username': username,
          'status': 'pending',
          'submit_time': TMUtils.date2str(datetime.datetime.now())
          }
   # Put params into the doc
   doc['params'] = kwargs
   self.update_job(job_id, doc)
   # Return the job id (not the ``id`` builtin)
   return job_id
Beispiel #22
0
  def output_segment(self, segment):
    """Convert a segment into a ``tu`` XML element.

    The element carries language and date attributes, an optional ``tuid``,
    ``prop`` children holding the first industry, type and organization
    value, a fixed ``tda-prod`` property, and one ``tuv`` child per side.

    Args:
      segment: Object exposing the segment attributes read below.

    Returns:
      The populated ``tu`` element.
    """
    tu = ElementTree.Element('tu')
    tu.set('srclang', TMUtils.list2str(segment.source_language))
    # Fall back to the current time when the segment has no date of its own
    created = segment.tm_creation_date or TMUtils.date2str(datetime.datetime.now())
    tu.set('creationdate', created)
    changed = segment.tm_change_date or TMUtils.date2str(datetime.datetime.now())
    tu.set('changedate', changed)
    if segment.tuid:
      tu.set('tuid', str(segment.tuid))

    for attr_name, prop_name in (('industry', "tda-industry"),
                                 ('type', "tda-type"),
                                 ('organization', "tda-org")):
      values = getattr(segment, attr_name)
      if values:
        first_value = values[0]  # only the first value is exported
        ElementTree.SubElement(tu, 'prop', {'type' : prop_name}).text = first_value
    ElementTree.SubElement(tu, 'prop', {'type' : "tda-prod"}).text = "Default"

    for side in ('source', 'target'):
      lang = TMUtils.list2str(getattr(segment, side + '_language'))
      tuv = ElementTree.SubElement(tu, 'tuv', {'{http://www.w3.org/XML/1998/namespace}lang' : lang})
      ElementTree.SubElement(tuv, 'seg').text = getattr(segment, side + '_text')

    return tu
Beispiel #23
0
  def _gen_lang_pairs(self, tuv, seg):
    """Generate a filled dict for every requested language pair in one tu.

    Args:
      tuv: Sequence of nodes whose language is read with ``self._get_lang``.
      seg: Sequence of nodes parallel to ``tuv``.

    Yields:
      The result of ``self._fill_lang`` for each source/target combination
      of every pair in ``self.lang_pairs``.
    """
    # Get all languages in the given tu
    by_lang = dict()
    for tu_node, seg_node in zip(tuv, seg):
      short_lang = TMUtils.lang2short(self._get_lang(tu_node))
      by_lang.setdefault(short_lang, []).append((tu_node, seg_node))

    # Generate all requested pairs (note: one tu can contain multiple
    # translations for the same language)
    for s_lang, t_lang in self.lang_pairs:
      for src_tuv, src_seg in by_lang.get(s_lang, []):
        for tgt_tuv, tgt_seg in by_lang.get(t_lang, []):
          yield self._fill_lang((src_tuv, tgt_tuv), (src_seg, tgt_seg))
Beispiel #24
0
  def delete(self, lang, ids):
    """Delete documents by id from the index of ``lang`` in one bulk call.

    Args:
      lang: Language used to derive the index name.
      ids: Iterable of document ids to delete.

    Returns:
      The status returned by the bulk helper, or the error text when the
      bulk call raises (the error is logged as a warning).
    """
    es_index = TMUtils.lang2es_index(lang)

    actions = []
    for doc_id in ids:
      actions.append({'_op_type': 'delete',
                      '_id': doc_id,
                      '_index' : es_index,
                      '_type': self.DOC_TYPE,
                      })
    # Bulk delete
    try:
      return helpers.bulk(self.es, actions)
    except Exception as e:
      logging.warning(e)
      return str(e)
Beispiel #25
0
 def _segment2es_bulk(self, segments, ftype, op_type, f_action):
   """Send one side of a batch of segments to the search index in bulk.

   Segments whose id was already seen in this batch are skipped, so only
   the first occurrence of each id is sent.

   Args:
     segments: Iterable of segments exposing ``<ftype>_id`` and
       ``<ftype>_language``.
     ftype: Side to send, used as the attribute prefix.
     op_type: Bulk operation type stored in each action.
     f_action: Callable ``(segment, ftype)`` producing the action source.

   Returns:
     The result of the bulk helper.
   """
   # Add segment source and target texts to the correspondent index of ElasticSearch in a batch
   seen_ids = set()
   actions = []
   for segment in segments:
     doc_id = getattr(segment, ftype + '_id')
     if doc_id not in seen_ids: # avoid duplicates in the same batch
       seen_ids.add(doc_id)
       actions.append({
         '_id': doc_id,
         '_index' : TMUtils.lang2es_index(getattr(segment, ftype + '_language')),
         '_type' : self.DOC_TYPE,
         '_op_type': op_type,
         '_source' : f_action(segment, ftype),
       })
   # Bulk insert
   logging.info("Bulk upsert: {}".format(actions))
   bulk_result = helpers.bulk(self.es, actions)
   self.refresh() # refresh list of indexes (could have been created during insert)
   return bulk_result
Beispiel #26
0
    def _get_index(self, source_lang, target_lang, create_missing=False):
        """Resolve the map index of a language pair, in either direction.

        Args:
            source_lang: Source language of the pair.
            target_lang: Target language of the pair.
            create_missing: When true, create the direct index if neither
                the direct nor the reverse index exists.

        Returns:
            A tuple ``(index_name, is_reversed)``: the direct index with
            ``False``, the reverse index with ``True``, or ``(None, None)``
            when no index exists and ``create_missing`` is false.
        """
        import logging  # local import keeps the block self-contained

        m_index = TMUtils.es_index2mapdb(TMUtils.lang2es_index(source_lang),
                                         TMUtils.lang2es_index(target_lang))

        if self.es.indices.exists(index=m_index):
            return m_index, False
        # Try reverse index
        r_index = TMUtils.es_index2mapdb(TMUtils.lang2es_index(target_lang),
                                         TMUtils.lang2es_index(source_lang))
        # Found reverse index - use it
        if self.es.indices.exists(index=r_index):
            return r_index, True
        if not create_missing:
            return None, None
        # Neither direct, nor reverse index exist - create a direct one
        try:
            self.es.indices.create(index=m_index)
        except Exception as e:
            # Creation stays best-effort, but the failure is now logged
            # instead of being silently dropped
            logging.warning("Failed to create index %s: %s", m_index, e)
        self.refresh_lang_graph()
        return m_index, False
Beispiel #27
0
 def get(self, source_id, source_lang, target_lang):
     """Look up the target id mapped to ``source_id`` for a language pair.

     Args:
         source_id: Object whose ``hex`` attribute is the document key.
         source_lang: Source language used to derive the database name.
         target_lang: Target language used to derive the database name.

     Returns:
         The stored ``target_id`` as a ``uuid.UUID``, or ``None`` when no
         document is found.
     """
     src_index = TMUtils.lang2es_index(source_lang)
     tgt_index = TMUtils.lang2es_index(target_lang)
     db_name = TMUtils.es_index2mapdb(src_index, tgt_index)
     found = self.server[db_name].get(source_id.hex)
     return uuid.UUID(found['target_id']) if found else None
Beispiel #28
0
 def add_segment(self, segment):
     """Store the mapping document of a single segment.

     Args:
         segment: Object exposing ``source_lang`` and ``target_lang``; it is
             converted with ``self._segment2doc``.

     Returns:
         The result of the database ``update`` call for the one-document
         batch.
     """
     src_index = TMUtils.lang2es_index(segment.source_lang)
     tgt_index = TMUtils.lang2es_index(segment.target_lang)
     database = self.server[TMUtils.es_index2mapdb(src_index, tgt_index)]
     document = self._segment2doc(segment)
     return database.update([document])
Beispiel #29
0
 def __call__(self, index, segments_iter):
   """Stamp every segment with the current check date and version.

   Args:
     index: Batch identifier; not used by this method.
     segments_iter: Iterable of segments to stamp in place.

   Yields:
     Each segment after its ``check_date`` and ``check_version`` are set.
   """
   for checked in segments_iter:
     stamp = TMUtils.date2str(datetime.datetime.now())
     checked.check_date = stamp
     checked.check_version = self.version
     yield checked
Beispiel #30
0
 def get_langs(self):
   """List the languages of all indexes named ``tm_`` plus two word characters.

   Returns:
     A list with ``TMUtils.es_index2lang(name)`` for every matching name in
     ``self.indexes``.
   """
   # Raw string: '\w' in a plain literal is an invalid escape sequence that
   # newer Python versions warn about
   return [TMUtils.es_index2lang(name) for name in self.indexes
           if re.search(r'^tm_\w{2}$', name)]