def output_segment(self, segment):
    """Build and return the ``<tu>`` element describing ``segment``.

    The element carries the source language, creation/change dates
    (falling back to the current time when the segment has none), the
    optional ``tuid`` and ``creationid`` attributes, a list of ``<prop>``
    children and one ``<tuv>`` child per side (source, then target).

    :param segment: object exposing the segment attributes read below
    :return: the newly created ``tu`` element
    """
    tu = etree.Element('tu')
    tu.set('srclang', TMUtils.list2str(segment.source_language))

    # Use the dates stored with the segment, or "now" when they are missing
    created = segment.tm_creation_date or TMUtils.date2str(datetime.datetime.now())
    tu.set('creationdate', created)
    changed = segment.tm_change_date or TMUtils.date2str(datetime.datetime.now())
    tu.set('changedate', changed)

    if segment.tuid:
        tu.set('tuid', str(segment.tuid))
    if segment.username:
        tu.set('creationid', segment.username)

    # Segment attribute -> name of the <prop> it is exported as
    tda_props = (
        ('industry', "tda-industry"),
        ('type', "tda-type"),
        ('organization', "tda-org"),
    )
    for attr_name, prop_name in tda_props:
        attr_value = getattr(segment, attr_name)
        if attr_value:
            etree.SubElement(tu, 'prop', {'type': prop_name}).text = self.list2str(attr_value)
    etree.SubElement(tu, 'prop', {'type': "tda-prod"}).text = "Default"

    if segment.metadata:
        for prop_type, prop_text in segment.metadata.items():
            if prop_type.startswith('tda-'):
                continue  # skip already handled props
            etree.SubElement(tu, 'prop', {'type': prop_type}).text = prop_text

    for side in ('source', 'target'):
        lang = TMUtils.list2str(getattr(segment, side + '_language'))
        tuv = etree.SubElement(tu, 'tuv', {'{http://www.w3.org/XML/1998/namespace}lang': lang})
        pos = getattr(segment, side + '_pos')
        if pos:
            etree.SubElement(tuv, 'prop', {'type': "pos"}).text = pos
        side_metadata = getattr(segment, side + '_metadata')
        if side_metadata:
            for prop_type, prop_text in side_metadata.items():
                etree.SubElement(tuv, 'prop', {'type': prop_type}).text = prop_text
        etree.SubElement(tuv, 'seg').text = getattr(segment, side + '_text')
    return tu
def _segment2doc(self, segment):
    """Refresh the DB date fields of ``segment`` and return it as a dict.

    Side effects on ``segment``: ``insert_date`` is set to the current time
    if empty, ``check_date`` is set to 1970-01-01 if empty, and
    ``update_date`` is always set to the current time.

    :param segment: object exposing the attributes listed in ``fields``
    :return: dict keyed by attribute name; the attributes named in
        ``list_valued`` are passed through ``TMUtils.str2list``
    """
    # Initialize/update DB date fields
    timestamp = TMUtils.date2str(datetime.datetime.now())
    if not segment.insert_date:
        segment.insert_date = timestamp
    if not segment.check_date:
        segment.check_date = TMUtils.date2str(datetime.datetime(1970, 1, 1))
    segment.update_date = timestamp

    # Attributes exported to the document, in output order
    fields = (
        'source_id', 'target_id',
        'source_text', 'target_text',
        'source_language', 'target_language',
        'source_metadata', 'target_metadata',
        'metadata', 'tuid',
        'industry', 'type', 'organization', 'file_name', 'domain',
        'tm_creation_date', 'tm_change_date',
        'insert_date', 'update_date', 'check_date',
        'check_version', 'dirty_score', 'username',
    )
    # Attributes converted with TMUtils.str2list before being stored
    list_valued = ('industry', 'type', 'organization', 'file_name', 'domain')

    doc = {}
    for name in fields:
        value = getattr(segment, name)
        if name in list_valued:
            value = TMUtils.str2list(value)
        doc[name] = value
    return doc
def generate_pivot(self, sdoc, tdoc):
    """Build a new document pairing the targets of two docs that share a source.

    The ``target_*`` fields of ``sdoc`` become the ``source_*`` fields of the
    result, and the ``target_*`` fields of ``tdoc`` become its ``target_*``
    fields. Every attribute in ``TMDbQuery.str_attrs`` that is present in
    ``sdoc`` is set to ``sdoc[attr] + tdoc[attr]`` when both values are
    non-empty, otherwise to ``None``. Creation/change/insert/update dates are
    set to the current time and ``check_date`` to 1970-01-01.

    :param sdoc: dict-like doc whose target side becomes the pivot's source
    :param tdoc: dict-like doc whose target side becomes the pivot's target
    :return: the generated document (a plain dict)
    :raises AssertionError: if the two docs do not have the same ``source_id``
    """
    if sdoc['source_id'] != tdoc['source_id']:
        logging.error(
            "Invalid pair for pivot generation: sdoc {}, tdoc {}".format(sdoc, tdoc))
        # make sure pivot exists. Raised explicitly instead of using ``assert``
        # so the check is not stripped under ``python -O``; the exception type
        # is kept for callers that already handle AssertionError.
        raise AssertionError(
            "source_id mismatch in pivot generation: {} != {}".format(
                sdoc['source_id'], tdoc['source_id']))

    # Result doc
    doc = dict()
    for attr in ['id', 'language', 'text']:
        doc['source_' + attr] = sdoc['target_' + attr]
        doc['target_' + attr] = tdoc['target_' + attr]

    for attr in TMDbQuery.str_attrs:
        if attr not in sdoc:
            continue
        # TODO: should it be union or intersection?
        doc[attr] = sdoc[attr] + tdoc[attr] if sdoc.get(attr) and tdoc.get(attr) else None

    # One timestamp for all four fields so they can never disagree
    now_str = TMUtils.date2str(datetime.datetime.now())
    for attr in ['tm_creation_date', 'tm_change_date', 'insert_date', 'update_date']:
        doc[attr] = now_str
    doc['check_date'] = TMUtils.date2str(datetime.datetime(1970, 1, 1))
    return doc
def __call__(self, index, segments_iter):
    """POS-tag a batch of segments and return them as a list.

    Every segment gets ``source_pos`` and ``target_pos`` set from the tagger
    output. As a side effect two text files are written to the system temp
    directory (``pos_tags-<date>-<index>.txt`` and
    ``iob_tags-<date>-<index>.txt``) with tagged text for the segments whose
    source text matches ``XmlUtils.TAG_PATTERN``.

    :param index: value embedded in the names of the two output files
    :param segments_iter: iterable of segments; it is fully consumed
    :return: list of the same segments, with POS fields filled in
    """
    # Import should be inside the function to avoid serializing all pos tagger
    # dependencies for parallel execution
    project_root = os.path.join(os.path.abspath(os.path.dirname(__file__)), '..', '..')
    # Guarded so repeated calls do not keep growing sys.path with duplicates
    if project_root not in sys.path:
        sys.path.append(project_root)
    sys.path = [p for p in sys.path if p]
    from TMPosTagger.TMPosTagger import TMPosTagger

    # Cache all segments. Though it might be expensive in terms of memory, we
    # need to gather all texts for the POS tagger batch and then store back the
    # batch of POS-tagged results. Batch should be small enough by splitting to
    # a sufficiently large number of Spark jobs
    segments = list(segments_iter)

    # Initialize PosTaggers for source and target languages
    pos_taggers = [
        TMPosTagger(lang.split('-')[0], universal=self.is_universal)
        for lang in self.langs
    ]

    # Invoke POS taggers for source and target segments
    src_texts = pos_taggers[0].tag_segments(
        [XmlUtils.replace_tags(s.source_text) for s in segments])
    tgt_texts = pos_taggers[1].tag_segments(
        [XmlUtils.replace_tags(s.target_text) for s in segments])

    # Store POS tags with XML tags as a training data. TODO: make it optional
    tmp_dir = tempfile.gettempdir()
    pos_path = tmp_dir + "/pos_tags-{}-{}.txt".format(
        TMUtils.date2str(datetime.datetime.now()), index)
    iob_path = tmp_dir + "/iob_tags-{}-{}.txt".format(
        TMUtils.date2str(datetime.datetime.now()), index)

    # ``with`` guarantees both files are closed even if tagging output or a
    # write raises part-way through the batch
    with open(pos_path, 'w') as pos_file, open(iob_path, 'w') as iob_file:
        for segment, stext, ttext in zip(segments, src_texts, tgt_texts):
            segment.source_pos = self.tags2string(stext)
            segment.target_pos = self.tags2string(ttext)
            # Write POS tags (+XML tags) to text file to be used as a training data
            # NOTE(review): re.match only matches at the start of the text; if
            # TAG_PATTERN is meant to find a tag anywhere, re.search may be
            # what is wanted - confirm before changing
            if re.match(XmlUtils.TAG_PATTERN, segment.source_text):
                pos_file.write("{}\n{}\n\n".format(
                    self.tags2string_xml_tags(segment.source_text, stext),
                    self.tags2string_xml_tags(segment.target_text, ttext)))
                # Distinct loop names: do not rebind the current segment
                for src_iob, tgt_iob in zip(
                        self.tags2string_iob_tags(segment.source_text, stext),
                        self.tags2string_iob_tags(segment.target_text, ttext)):
                    iob_file.write("{}\n{}\n\n".format(src_iob, tgt_iob))
    return segments
def init_job(self, job_id=None, username=None, type='default', **kwargs):
    """Create a job document in 'pending' status and store it.

    :param job_id: id of the job; when empty a new one is obtained from
        ``self._allocate_id()``
    :param username: stored in the document as ``username``
    :param type: stored in the document as ``type`` (the name shadows the
        builtin but is kept because callers may pass it by keyword)
    :param kwargs: any extra keyword arguments, stored as ``params``
    :return: the id under which the job was stored
    """
    # Allocate first so the stored 'id' field matches the id the job is saved
    # under, instead of being None for auto-allocated jobs
    if not job_id:
        job_id = self._allocate_id()
    doc = {
        'id': job_id,
        'type': type,
        'username': username,
        'status': 'pending',
        'submit_time': TMUtils.date2str(datetime.datetime.now()),
        # Put params into the doc
        'params': kwargs,
    }
    self.update_job(job_id, doc)
    # Return the job id (the original returned the builtin ``id`` function)
    return job_id
def output_segment(self, segment):
    """Build and return the ``<tu>`` element describing ``segment``.

    The element carries the source language, creation/change dates
    (falling back to the current time when the segment has none), the
    optional ``tuid`` attribute, the ``tda-*`` ``<prop>`` children and one
    ``<tuv>``/``<seg>`` pair per side (source, then target). Only the first
    entry of ``industry``, ``type`` and ``organization`` is exported.

    :param segment: object exposing the segment attributes read below
    :return: the newly created ``tu`` element
    """
    tu = ElementTree.Element('tu')
    tu.set('srclang', TMUtils.list2str(segment.source_language))

    # Use the dates stored with the segment, or "now" when they are missing
    created = segment.tm_creation_date or TMUtils.date2str(datetime.datetime.now())
    tu.set('creationdate', created)
    changed = segment.tm_change_date or TMUtils.date2str(datetime.datetime.now())
    tu.set('changedate', changed)

    if segment.tuid:
        tu.set('tuid', str(segment.tuid))

    # Segment attribute -> name of the <prop> it is exported as
    tda_props = (
        ('industry', "tda-industry"),
        ('type', "tda-type"),
        ('organization', "tda-org"),
    )
    for attr_name, prop_name in tda_props:
        values = getattr(segment, attr_name)
        if values:
            ElementTree.SubElement(tu, 'prop', {'type': prop_name}).text = values[0]
    ElementTree.SubElement(tu, 'prop', {'type': "tda-prod"}).text = "Default"

    for side in ('source', 'target'):
        lang = TMUtils.list2str(getattr(segment, side + '_language'))
        tuv = ElementTree.SubElement(tu, 'tuv', {'{http://www.w3.org/XML/1998/namespace}lang': lang})
        ElementTree.SubElement(tuv, 'seg').text = getattr(segment, side + '_text')
    return tu
def __call__(self, index, segments_iter):
    """Lazily stamp each segment as checked and yield it.

    For every segment, ``check_date`` is set to the current time and
    ``check_version`` to ``self.version``.

    :param index: not used by this method
    :param segments_iter: iterable of segments to stamp
    :return: generator yielding the same segment objects, in order
    """
    for seg in segments_iter:
        checked_at = TMUtils.date2str(datetime.datetime.now())
        seg.check_date = checked_at
        seg.check_version = self.version
        yield seg
def finalize(self, job_id, status='finished'):
    """Record the end time and final status of a job.

    The job document is fetched with ``self.get_job``, its ``end_time`` is
    set to the current time and its ``status`` to ``status``, and it is
    written back with ``self.update_job``.

    :param job_id: id of the job to finalize
    :param status: status value to store (defaults to ``'finished'``)
    """
    job_doc = self.get_job(job_id)
    ended_at = TMUtils.date2str(datetime.datetime.now())
    job_doc['end_time'] = ended_at
    job_doc['status'] = status
    self.update_job(job_id, job_doc)