def generate_pivot(self, sdoc, tdoc):
    """Combine two map documents sharing the same pivot source into one
    source->target document.

    :param sdoc: map doc whose target side becomes the new source side
    :param tdoc: map doc whose target side stays the target side
    :return: merged pivot document (dict)
    :raises AssertionError: if the two docs don't share the same source_id
    """
    if sdoc['source_id'] != tdoc['source_id']:
        logging.error(
            "Invalid pair for pivot generation: sdoc {}, tdoc {}".format(
                sdoc, tdoc))
        # Raise explicitly instead of using a bare assert so the check
        # survives `python -O`; exception type is unchanged for callers.
        raise AssertionError("Pivot source_id mismatch")  # make sure pivot exists
    # Result doc
    doc = dict()
    for attr in ['id', 'language', 'text']:
        doc['source_' + attr] = sdoc['target_' + attr]
        doc['target_' + attr] = tdoc['target_' + attr]
    for attr in TMDbQuery.str_attrs:
        if not attr in sdoc:
            continue
        # TODO: should it be union or intersection?
        doc[attr] = sdoc[attr] + tdoc[attr] if sdoc.get(attr) and tdoc.get(
            attr) else None
    # All date fields of the generated doc are stamped "now", except
    # check_date which is reset to the epoch placeholder ("never checked")
    for attr in [
            'tm_creation_date', 'tm_change_date', 'insert_date', 'update_date'
    ]:
        doc[attr] = TMUtils.date2str(datetime.datetime.now())
    doc['check_date'] = TMUtils.date2str(datetime.datetime(1970, 1, 1))
    return doc
def output_segment(self, segment):
    """Serialize a segment into a TMX <tu> element (lxml etree version).

    Builds the <tu> with creation/change dates, optional tuid/creationid,
    TDA property sub-elements, generic metadata props, and one <tuv> per
    language side (source/target) with optional POS and per-side metadata.

    :param segment: segment object exposing TMX-mappable attributes
    :return: etree Element representing the <tu>
    """
    e = etree.Element('tu')
    e.set('srclang', TMUtils.list2str(segment.source_language))
    # Fall back to "now" when the TM doesn't carry its own dates
    dt = segment.tm_creation_date if segment.tm_creation_date else TMUtils.date2str(datetime.datetime.now())
    e.set('creationdate', dt)
    dt = segment.tm_change_date if segment.tm_change_date else TMUtils.date2str(datetime.datetime.now())
    e.set('changedate', dt)
    if segment.tuid: e.set('tuid', str(segment.tuid))
    if segment.username: e.set('creationid', segment.username)
    # TDA-specific properties; product is always "Default"
    if segment.industry: etree.SubElement(e, 'prop', {'type' : "tda-industry"}).text = self.list2str(segment.industry)
    if segment.type: etree.SubElement(e, 'prop', {'type' : "tda-type"}).text = self.list2str(segment.type)
    if segment.organization: etree.SubElement(e, 'prop', {'type' : "tda-org"}).text = self.list2str(segment.organization)
    etree.SubElement(e, 'prop', {'type' : "tda-prod"}).text = "Default"
    if segment.metadata:
        for prop_type,prop_text in segment.metadata.items():
            if not prop_type.startswith('tda-'): # skip already handled props
                etree.SubElement(e, 'prop', {'type': prop_type}).text = prop_text
    for t in ['source', 'target']:
        # xml:lang must use the XML-namespace-qualified attribute name
        tuv = etree.SubElement(e, 'tuv', {'{http://www.w3.org/XML/1998/namespace}lang' : TMUtils.list2str(getattr(segment, t + '_language'))})
        if getattr(segment, t + '_pos'):
            etree.SubElement(tuv, 'prop', {'type': "pos"}).text = getattr(segment, t + '_pos')
        if getattr(segment, t + '_metadata'):
            for prop_type, prop_text in getattr(segment, t + '_metadata').items():
                etree.SubElement(tuv, 'prop', {'type': prop_type}).text = prop_text
        etree.SubElement(tuv, 'seg').text = getattr(segment, t + '_text')
    return e
def get(self, source_id, source_lang, target_lang):
    """Look up the target-segment id mapped to source_id in MongoDB.

    :param source_id: id of the source-side segment
    :param source_lang: source language (resolved to an ES index name)
    :param target_lang: target language (resolved to an ES index name)
    :return: mapped target_id, or None when no mapping exists
    """
    src_index = TMUtils.lang2es_index(source_lang)
    tgt_index = TMUtils.lang2es_index(target_lang)
    m_index = TMUtils.es_index2mapdb(src_index, tgt_index)
    # find_one avoids the deprecated/removed Cursor.count() and the
    # always-truthy cursor check of the original find()-based version
    m_result = self.mongo_db[m_index].find_one({'source_id': source_id})
    if not m_result:
        return None
    return m_result['target_id']
def add_segment(self, segment):
    """Upsert the MongoDB mapping document for a single segment.

    :param segment: segment providing source/target langs and ids
    :return: pymongo UpdateResult of the upsert
    """
    src_index = TMUtils.lang2es_index(segment.source_lang)
    tgt_index = TMUtils.lang2es_index(segment.target_lang)
    m_index = TMUtils.es_index2mapdb(src_index, tgt_index)
    # TODO: do not update if creation date is older than existing one
    return self.mongo_db[m_index].update_one(
        {'source_id': segment.source_id},
        {'$set': self._segment2doc(segment)},
        upsert=True)  # insert if doesn't exist
def _segment2doc(self, segment, ftype):
    """Build the ES document for one side ('source' or 'target') of a segment.

    :param segment: segment object
    :param ftype: 'source' or 'target' -- which side to serialize
    :return: dict with text, optional POS, opposite-side language list,
             and token count
    """
    doc = {'text': getattr(segment, ftype + '_text')}
    # Optional fields (POS, tokenized).
    # NOTE: the original read <ftype>_pos with a guard-less getattr BEFORE
    # this hasattr() check, which raised AttributeError for segments
    # without POS and made the guard dead code; that line is removed.
    if hasattr(segment, ftype + '_pos'):
        doc['pos'] = getattr(segment, ftype + '_pos')
    op_ftype = 'source' if ftype == 'target' else 'target'
    # Auxiliary field to facilitate language matrix generation
    doc['target_language'] = [TMUtils.lang2short(
        TMUtils.str2list(getattr(segment, op_ftype + '_language'))[0])]
    doc['token_cnt'] = self.token_count(getattr(segment, ftype + '_text'),
                                        getattr(segment, ftype + '_language'))
    return doc
def add_segments(self, segments):
    """Bulk-upsert mapping docs for segments (all same language pair) in CouchDB.

    :param segments: list of segments; empty/None input is a no-op
    :return: CouchDB bulk update result, or None for empty input
    """
    if not segments:
        return
    m_index = TMUtils.es_index2mapdb(
        TMUtils.lang2es_index(segments[0].source_lang),
        TMUtils.lang2es_index(segments[0].target_lang))
    # Lazily create the per-language-pair database on first use.
    # `except Exception` (not a bare except) so KeyboardInterrupt and
    # SystemExit are not swallowed by the fallback.
    try:
        db = self.server[m_index]
    except Exception:
        db = self.server.create(m_index)
    return db.update([self._segment2doc(s) for s in segments])
def get(self, source_id, source_lang, target_lang):
    """Fetch the target_id mapped to source_id from the SQL map table.

    :raises Exception: if no table exists for the language pair
    :return: target_id of the mapping, or None when no row matches
    """
    tname = TMUtils.es_index2mapdb(TMUtils.lang2es_index(source_lang),
                                   TMUtils.lang2es_index(target_lang))
    if not tname in self.tables:
        raise Exception("Language pair : {} - {} doesn't exist".format(
            source_lang, target_lang))
    # TODO: implement bidirectional query
    t = self.tables[tname]
    res = self.conn.execute(
        t.select(t.target_id).where(t.source_id == source_id))
    # fetchone() returns None when no row matched; the original
    # unconditionally subscripted it and crashed with TypeError
    row = res.fetchone() if res else None
    return row[0] if row else None
def get(self, lang, id):
    """Fetch a document by id from the language's ES index.

    :param lang: language whose index to query
    :param id: document id
    :return: the document's _source dict, or None when the index or
             document is missing
    """
    index = TMUtils.lang2es_index(lang)
    if not self.index_exists(index):
        return None
    hit = self.es.get(index=index, id=id)
    return hit['_source'] if hit else None
def _fill_lang(self, tuv, seg):
    """Build a language-pair dict from a (source, target) tuv/seg element pair.

    :param tuv: 2-tuple of <tuv> elements (source, target)
    :param seg: 2-tuple of <seg> elements (source, target)
    :return: dict with source/target language, text and metadata
    """
    d = dict()
    d['source_language'] = TMUtils.lang2short(self._get_lang(tuv[0]))#tuv[0].attrib.get('lang')#get('{%s}lang' % self.NS)
    d['target_language'] = TMUtils.lang2short(self._get_lang(tuv[1])) #tuv[1].attrib.get('lang')#get('{%s}lang' % self.NS)
    d['source_text'] = self._get_text(seg[0])
    # NOTE(review): decode('utf8').encode('utf8') yields bytes again -- it
    # only validates the encoding (raising UnicodeDecodeError on bad input).
    # If a str result was intended, the trailing .encode() should be dropped;
    # verify what downstream consumers expect before changing.
    if isinstance(d['source_text'], bytes):
        d['source_text'] = d['source_text'].decode('utf8').encode('utf8')
    d['target_text'] = self._get_text(seg[1])
    if isinstance(d['target_text'], bytes):
        d['target_text'] = d['target_text'].decode('utf8').encode('utf8')
    d['source_metadata'] = self._parse_metadata(tuv[0])
    d['target_metadata'] = self._parse_metadata(tuv[1])
    return d
def __call__(self, index, segments_iter):
    """POS-tag a batch of segments (source and target sides) in place.

    Also dumps POS-tag and IOB-tag training data to temp files named with
    a shared timestamp and the batch index.

    :param index: batch/partition index (used in the dump file names)
    :param segments_iter: iterable of segments to tag
    :return: list of segments with source_pos/target_pos filled in
    """
    # Import should be inside the function to avoid serializing all pos
    # tagger dependencies for parallel execution
    sys.path.append(
        os.path.join(os.path.abspath(os.path.dirname(__file__)), '..', '..'))
    sys.path = [p for p in sys.path if p]
    from TMPosTagger.TMPosTagger import TMPosTagger
    # Cache all segments. Though it might be expensive in terms of memory,
    # we need to gather all texts for POS tagger batch and then store back
    # batch of POS-tagged results. Batch should be small enough by splitting
    # to sufficiently large number of Spark jobs
    segments = list(segments_iter)
    # Initialize PosTaggers for source and target languages
    pos_taggers = [
        TMPosTagger(lang.split('-')[0], universal=self.is_universal)
        for lang in self.langs
    ]
    # Invoke POS taggers for source and target segments
    src_texts = pos_taggers[0].tag_segments(
        [XmlUtils.replace_tags(s.source_text) for s in segments])
    tgt_texts = pos_taggers[1].tag_segments(
        [XmlUtils.replace_tags(s.target_text) for s in segments])
    # Store POS tags with XML tags as a training data. TODO: make it optional
    # Timestamp is computed once so both dump files share the same suffix
    # (the original called now() twice and the names could differ).
    stamp = TMUtils.date2str(datetime.datetime.now())
    pos_path = tempfile.gettempdir() + "/pos_tags-{}-{}.txt".format(stamp, index)
    iob_path = tempfile.gettempdir() + "/iob_tags-{}-{}.txt".format(stamp, index)
    # Context managers guarantee the dump files are closed even when
    # tagging raises (the original leaked both handles on exceptions)
    with open(pos_path, 'w') as f, open(iob_path, 'w') as iobs:
        for s, stext, ttext in zip(segments, src_texts, tgt_texts):
            s.source_pos = self.tags2string(stext)
            s.target_pos = self.tags2string(ttext)
            # Write POS tags (+XML tags) to text file to be used as training data
            if re.match(XmlUtils.TAG_PATTERN, s.source_text):
                f.write("{}\n{}\n\n".format(
                    self.tags2string_xml_tags(s.source_text, stext),
                    self.tags2string_xml_tags(s.target_text, ttext)))
                # Loop vars renamed: the original reused 's', clobbering
                # the enclosing segment variable
                for src_iob, tgt_iob in zip(
                        self.tags2string_iob_tags(s.source_text, stext),
                        self.tags2string_iob_tags(s.target_text, ttext)):
                    iobs.write("{}\n{}\n\n".format(src_iob, tgt_iob))
    return segments
def scan(self, lang, filter = None):
    """Iterate over all documents of the language's ES index.

    :param lang: language whose index to scan
    :param filter: optional filter forwarded to TMDbQuery
    :yield: raw ES hits
    """
    index = TMUtils.lang2es_index(lang)
    if self.index_exists(index):
        # Build segment by querying map and target index
        db_query = TMDbQuery(es=self.es, index=index, filter=filter)
        yield from db_query.scan()
def add_segments(self, segments):
    """Bulk-upsert mapping docs for segments via an unordered MongoDB bulk op.

    All segments are assumed to share one language pair: the bulk op is
    bound to the map index derived from the first segment seen.

    :param segments: iterable of segments; empty input is a no-op
    :return: bulk execution result, the BulkWriteError details on partial
             failure, or None for empty input
    """
    # Guard: the original called bulk.execute() with bulk still None when
    # given an empty iterable, raising AttributeError
    if not segments:
        return None
    bulk = None
    for segment in segments:
        if not bulk:
            m_index = TMUtils.es_index2mapdb(
                TMUtils.lang2es_index(segment.source_lang),
                TMUtils.lang2es_index(segment.target_lang))
            bulk = self.mongo_db[m_index].initialize_unordered_bulk_op()
        bulk.find({'source_id': segment.source_id}) \
            .update_one({'$set': self._segment2doc(segment)})
    try:
        result = bulk.execute()
    except BulkWriteError as bwe:
        # Report partial-failure details instead of propagating
        result = bwe.details
        logging.error(bwe.details)
    return result
def _segment2doc(self, segment):
    """Serialize a segment into its MongoDB mapping document.

    Side effects on the segment: initializes insert_date (to now) and
    check_date (to the epoch placeholder, meaning "never checked") when
    unset, and always refreshes update_date to now.

    :param segment: segment object to serialize (mutated, see above)
    :return: dict ready to be stored in the map collection
    """
    # Initialize/update DB date fields
    now_str = TMUtils.date2str(datetime.datetime.now())
    if not segment.insert_date:
        segment.insert_date = now_str
    if not segment.check_date:
        # Epoch date marks a segment that has never been checked
        segment.check_date = TMUtils.date2str(datetime.datetime(
            1970, 1, 1))
    segment.update_date = now_str
    return {
        'source_id': segment.source_id,
        'target_id': segment.target_id,
        'source_text': segment.source_text,
        'target_text': segment.target_text,
        'source_language': segment.source_language,
        'target_language': segment.target_language,
        'source_metadata': segment.source_metadata,
        'target_metadata': segment.target_metadata,
        'metadata': segment.metadata,
        'tuid': segment.tuid,
        # List-valued attributes are normalized via str2list
        'industry': TMUtils.str2list(segment.industry),
        'type': TMUtils.str2list(segment.type),
        'organization': TMUtils.str2list(segment.organization),
        'file_name': TMUtils.str2list(segment.file_name),
        'domain': TMUtils.str2list(segment.domain),
        'tm_creation_date': segment.tm_creation_date,
        'tm_change_date': segment.tm_change_date,
        'insert_date': segment.insert_date,
        'update_date': segment.update_date,
        'check_date': segment.check_date,
        'check_version': segment.check_version,
        'dirty_score': segment.dirty_score,
        'username': segment.username
    }
def scan_pivot(self, pivot_lang, langs):
    """Yield ids of pivot-language docs translated into every requested language.

    :param pivot_lang: pivot language whose index is scanned
    :param langs: target languages the doc must match on target_language
    :yield: ES document ids
    """
    index = TMUtils.lang2es_index(pivot_lang)
    if not self.index_exists(index):
        return
    # Chain one 'match' clause per required target language
    search = Search(using=self.es, index=index)
    for target in langs:
        search = search.query('match', target_language=target)
    yield from (hit.meta.id for hit in search.scan())
def mget(self, ids_lang):
    """Multi-get documents from the per-language ES indexes.

    :param ids_lang: iterable of (lang, id) pairs
    :return: [] for empty input, None if ES returned nothing, otherwise a
             list of _source dicts (None entries for missing docs)
    """
    if not ids_lang:
        return []
    docs = [{'_index': TMUtils.lang2es_index(lang), '_id': id}
            for lang, id in ids_lang]
    hits = self.es.mget(body={'docs': docs})
    if not hits:
        return None
    return [h.get('_source', None) for h in hits['docs']]
def add_segment(self, segment, ftype):
    """Index one side ('source'/'target') of a segment into its ES index.

    :param segment: segment object
    :param ftype: 'source' or 'target' -- which side to index
    :return: the document id used for indexing
    """
    # Add segment source and target texts to the correspondent index of ElasticSearch
    seg_id = getattr(segment, ftype + '_id')
    lang = getattr(segment, ftype + '_language')
    self.es.index(index=TMUtils.lang2es_index(lang),
                  doc_type=self.DOC_TYPE,
                  id=seg_id,
                  body=self._segment2doc(segment, ftype))
    return seg_id
def _segment2table(self, segment, suffix=None):
    """Return (creating lazily on first use) the SQL map table for the
    segment's language pair.

    :param segment: segment providing source_lang/target_lang
    :param suffix: optional suffix appended to the table name
    :return: SQLAlchemy Table for the pair
    """
    tname = TMUtils.es_index2mapdb(
        TMUtils.lang2es_index(segment.source_lang),
        TMUtils.lang2es_index(segment.target_lang))
    if suffix:
        tname += suffix
    if not tname in self.tables:
        # Define and create the mapping table on demand; create() is a
        # no-op if the table already exists in the database (checkfirst)
        md = MetaData()
        self.tables[tname] = Table(tname, md,
                                   Column('id', Integer, primary_key=True),
                                   Column('source_id', GUID, index=True),
                                   Column('target_id', GUID, index=True),
                                   Column('creation_date', TIMESTAMP),
                                   Column('change_date', TIMESTAMP),
                                   mysql_engine='InnoDB',
                                   mysql_charset='utf8')
        md.bind = self.conn
        self.tables[tname].create(checkfirst=True)
    return self.tables[tname]
def mquery(self, lang, limit, q_list, filter=None):
    """Run a batch of queries against the language's index.

    :param lang: language whose index to query
    :param limit: per-query result limit
    :param q_list: list of query strings
    :param filter: optional filter forwarded to TMDbQuery
    :yield: one ES response per query
    """
    index = TMUtils.lang2es_index(lang)
    if not self.index_exists(index):
        return
    # Query source ES for the text
    db_query = TMDbQuery(es=self.es, index=index, q=q_list,
                         filter=filter, limit=limit)
    for response, _ in db_query():
        yield response
def query(self, lang, qstring, filter = None):
    """Query the language's index and yield individual hits.

    :param lang: language whose index to query
    :param qstring: query string(s) forwarded to TMDbQuery
    :param filter: optional filter forwarded to TMDbQuery
    :yield: (hit, query) pairs
    """
    index = TMUtils.lang2es_index(lang)
    if not self.index_exists(index):
        return
    # Query source ES for the text
    db_query = TMDbQuery(es=self.es, index=index, q=qstring, filter=filter)
    for response, q in db_query():
        yield from ((hit, q) for hit in response)
def __call__(self, segment):
    """Score a segment by language-identification confidence.

    For each applicable side, runs language detection and adds a penalty
    (scaled inversely by the detection probability) when the probability
    falls below the configured threshold.

    :param segment: segment with source_text/target_text
    :return: accumulated penalty score (0 when both sides pass)
    """
    score = 0
    for lang, type in zip(self.langs, ['source', 'target']):
        # NOTE(review): 'self.rule_langs == lang' compares the whole
        # rule_langs value against a single language; if rule_langs is a
        # collection, 'lang in self.rule_langs' may have been intended --
        # verify against how rule_langs is configured.
        if not self.rule_langs or self.rule_langs == lang:
            det_lang, prob = TMUtils.detect_lang(
                getattr(segment, type + '_text'), [lang])
            # If language probability is lower than the threshold,
            # penalize segment with the score proportional to probability
            if prob < self.rule_dict.get('threshold',
                                         LangidRule.DEFAULT_THRESHOLD):
                score += self.rule_dict.get(
                    'score', self.DEFAULT_SCORE) * (1 / prob)
    return score
def init_job(self, job_id=None, username=None, type='default', **kwargs):
    """Create a pending job record and return its id.

    :param job_id: explicit job id; auto-allocated when falsy
    :param username: submitting user
    :param type: job type label
    :param kwargs: arbitrary job parameters, stored under 'params'
    :return: the job id (the original returned the builtin `id` function
             by mistake)
    """
    # Allocate first so the doc records the real id; the original built
    # the doc before allocation and stored 'id': None for auto ids
    if not job_id:
        job_id = self._allocate_id()
    doc = {
        'id': job_id,
        'type': type,
        'username': username,
        'status': 'pending',
        'submit_time': TMUtils.date2str(datetime.datetime.now())
    }
    # Put params into the doc
    doc['params'] = kwargs
    self.update_job(job_id, doc)
    return job_id
def output_segment(self, segment):
    """Serialize a segment into a TMX <tu> element (xml.etree.ElementTree version).

    Builds the <tu> with creation/change dates, optional tuid, TDA property
    sub-elements, and one <tuv> (with <seg> text) per language side.

    :param segment: segment object exposing TMX-mappable attributes
    :return: ElementTree Element representing the <tu>
    """
    e = ElementTree.Element('tu')
    e.set('srclang', TMUtils.list2str(segment.source_language))
    # Fall back to "now" when the TM doesn't carry its own dates
    dt = segment.tm_creation_date if segment.tm_creation_date else TMUtils.date2str(datetime.datetime.now())
    e.set('creationdate', dt)
    dt = segment.tm_change_date if segment.tm_change_date else TMUtils.date2str(datetime.datetime.now())
    e.set('changedate', dt)
    if segment.tuid: e.set('tuid', str(segment.tuid))
    # TDA-specific properties (first element of each list); product is
    # always "Default"
    if segment.industry: ElementTree.SubElement(e, 'prop', {'type' : "tda-industry"}).text = segment.industry[0]
    if segment.type: ElementTree.SubElement(e, 'prop', {'type' : "tda-type"}).text = segment.type[0]
    if segment.organization: ElementTree.SubElement(e, 'prop', {'type' : "tda-org"}).text = segment.organization[0]
    ElementTree.SubElement(e, 'prop', {'type' : "tda-prod"}).text = "Default"
    for t in ['source', 'target']:
        # xml:lang must use the XML-namespace-qualified attribute name
        tuv = ElementTree.SubElement(e, 'tuv', {'{http://www.w3.org/XML/1998/namespace}lang' : TMUtils.list2str(getattr(segment, t + '_language'))})
        ElementTree.SubElement(tuv, 'seg').text = getattr(segment, t + '_text')
    return e
def _gen_lang_pairs(self, tuv, seg):
    """Generate all requested (source, target) language-pair dicts from a tu.

    :param tuv: list of <tuv> elements of one translation unit
    :param seg: list of matching <seg> elements
    :yield: filled language-pair dicts (see _fill_lang)
    """
    # Group (tuv, seg) pairs by short language code. Note: one tu can
    # contain multiple translations for the same language.
    lang_map = dict()
    for tu_el, seg_el in zip(tuv, seg):
        lang = TMUtils.lang2short((self._get_lang(tu_el)))
        lang_map.setdefault(lang, []).append((tu_el, seg_el))
    # Emit every source/target combination for each configured pair
    for s_lang, t_lang in self.lang_pairs:
        for s_tuv, s_seg in lang_map.get(s_lang, []):
            for t_tuv, t_seg in lang_map.get(t_lang, []):
                yield self._fill_lang((s_tuv, t_tuv), (s_seg, t_seg))
def delete(self, lang, ids):
    """Bulk-delete documents by id from the language's ES index.

    :param lang: language whose index is targeted
    :param ids: iterable of document ids to delete
    :return: helpers.bulk status, or the error message string on failure
    """
    index = TMUtils.lang2es_index(lang)
    actions = [{
        '_op_type': 'delete',
        '_id': doc_id,
        '_index': index,
        '_type': self.DOC_TYPE,
    } for doc_id in ids]
    # Bulk delete; failures are logged and reported as a string
    try:
        return helpers.bulk(self.es, actions)
    except Exception as e:
        logging.warning(e)
        return str(e)
def _segment2es_bulk(self, segments, ftype, op_type, f_action):
    """Bulk-write one side of many segments to their ES indexes.

    :param segments: segments to process
    :param ftype: 'source' or 'target' -- which side to serialize
    :param op_type: ES bulk op type (e.g. 'index', 'update')
    :param f_action: callable(segment, ftype) -> document body
    :return: result of elasticsearch.helpers.bulk
    """
    # Add segment source and target texts to the correspondent index of
    # ElasticSearch in a batch
    actions = []
    seen_ids = set()
    for segment in segments:
        doc_id = getattr(segment, ftype + '_id')
        if doc_id in seen_ids:
            continue  # avoid duplicates in the same batch
        seen_ids.add(doc_id)
        lang = getattr(segment, ftype + '_language')
        actions.append({
            '_id': doc_id,
            '_index': TMUtils.lang2es_index(lang),
            '_type': self.DOC_TYPE,
            '_op_type': op_type,
            '_source': f_action(segment, ftype),
        })
    # Bulk insert
    logging.info("Bulk upsert: {}".format(actions))
    s_result = helpers.bulk(self.es, actions)
    # refresh list of indexes (could have been created during insert)
    self.refresh()
    return s_result
def _get_index(self, source_lang, target_lang, create_missing=False):
    """Resolve the map-DB index name for a language pair.

    Checks the direct index first, then the reversed pair; optionally
    creates the direct index when neither exists.

    :param source_lang: source language
    :param target_lang: target language
    :param create_missing: create the direct index if neither direction exists
    :return: (index_name, is_reversed); (None, None) when missing and
             create_missing is False
    """
    m_index = TMUtils.es_index2mapdb(TMUtils.lang2es_index(source_lang),
                                     TMUtils.lang2es_index(target_lang))
    if self.es.indices.exists(index=m_index):
        return m_index, False
    # Try reverse index
    r_index = TMUtils.es_index2mapdb(TMUtils.lang2es_index(target_lang),
                                     TMUtils.lang2es_index(source_lang))
    # Found reverse index - use it
    if self.es.indices.exists(index=r_index):
        return r_index, True
    if not create_missing:
        return None, None
    # Neither direct, nor reverse index exist - create a direct one.
    # Creation may race with another writer ("index already exists"), so
    # the failure is tolerated -- but logged instead of the original's
    # silent bare `except: pass`.
    try:
        self.es.indices.create(index=m_index)
    except Exception as e:
        logging.warning(e)
    self.refresh_lang_graph()
    return m_index, False
def get(self, source_id, source_lang, target_lang):
    """Look up the target UUID mapped to source_id in CouchDB.

    :param source_id: uuid.UUID of the source segment (queried by .hex)
    :return: uuid.UUID of the target, or None when no mapping exists
    """
    m_index = TMUtils.es_index2mapdb(TMUtils.lang2es_index(source_lang),
                                     TMUtils.lang2es_index(target_lang))
    doc = self.server[m_index].get(source_id.hex)
    return uuid.UUID(doc['target_id']) if doc else None
def add_segment(self, segment):
    """Upsert the mapping document for one segment into CouchDB.

    :param segment: segment providing source/target langs for DB selection
    :return: CouchDB update result
    """
    src_index = TMUtils.lang2es_index(segment.source_lang)
    tgt_index = TMUtils.lang2es_index(segment.target_lang)
    db = self.server[TMUtils.es_index2mapdb(src_index, tgt_index)]
    return db.update([self._segment2doc(segment)])
def __call__(self, index, segments_iter):
    """Stamp each segment with the current check date and version.

    :param index: partition index (unused; kept for the partition-mapper
                  call signature)
    :param segments_iter: iterable of segments
    :yield: the same segments, check-stamped
    """
    for seg in segments_iter:
        # Timestamp is taken per segment, matching the original behavior
        seg.check_date = TMUtils.date2str(datetime.datetime.now())
        seg.check_version = self.version
        yield seg
def get_langs(self):
    """List language codes for all ES indexes named like 'tm_xx'.

    :return: list of language codes derived from the matching index names
    """
    # Raw string for the regex: '\w' inside a plain string literal is an
    # invalid escape sequence (DeprecationWarning since Python 3.6)
    return [TMUtils.es_index2lang(l)
            for l in self.indexes if re.search(r'^tm_\w{2}$', l)]