def _segment2doc(self, segment):
    """Convert a segment object into a flat document dict.

    The segment's date fields are updated in place before the document is
    built:

    * ``insert_date`` is set to the current time only when it is falsy;
    * ``check_date`` is set to 1970-01-01 only when it is falsy;
    * ``update_date`` is always overwritten with the current time.

    All dates are formatted with ``TMUtils.date2str``.

    Args:
        segment: object exposing the attributes copied below.

    Returns:
        dict keyed by field name. The ``industry``, ``type``,
        ``organization``, ``file_name`` and ``domain`` values are passed
        through ``TMUtils.str2list``; every other value is copied as-is.
    """
    # Fields copied verbatim, placed before the list-valued ones.
    leading_fields = (
        'source_id', 'target_id',
        'source_text', 'target_text',
        'source_language', 'target_language',
        'source_metadata', 'target_metadata',
        'metadata', 'tuid',
    )
    # Fields converted with TMUtils.str2list.
    list_fields = ('industry', 'type', 'organization', 'file_name', 'domain')
    # Fields copied verbatim, placed after the list-valued ones.
    trailing_fields = (
        'tm_creation_date', 'tm_change_date',
        'insert_date', 'update_date', 'check_date',
        'check_version', 'dirty_score', 'username',
    )

    # Initialize/update DB date fields
    timestamp = TMUtils.date2str(datetime.datetime.now())
    if not segment.insert_date:
        segment.insert_date = timestamp
    if not segment.check_date:
        epoch_start = datetime.datetime(1970, 1, 1)
        segment.check_date = TMUtils.date2str(epoch_start)
    segment.update_date = timestamp

    # Build the document group by group so the key order stays stable.
    document = {name: getattr(segment, name) for name in leading_fields}
    document.update(
        (name, TMUtils.str2list(getattr(segment, name)))
        for name in list_fields
    )
    document.update((name, getattr(segment, name)) for name in trailing_fields)
    return document
def _segment2doc(self, segment, ftype):
    """Build the document dict for one side of a segment.

    Args:
        segment: object exposing ``<ftype>_text``, ``<ftype>_language`` and
            the opposite side's ``_language`` attribute. ``<ftype>_pos`` is
            optional.
        ftype: attribute prefix selecting the side to convert. ``'target'``
            uses ``'source'`` as the opposite side; any other value uses
            ``'target'`` as the opposite side.

    Returns:
        dict with the keys:

        * ``'text'``: value of ``<ftype>_text``;
        * ``'pos'``: value of ``<ftype>_pos``, present only when the
          segment has that attribute;
        * ``'target_language'``: one-element list holding
          ``TMUtils.lang2short`` of the first entry of the opposite side's
          language (after ``TMUtils.str2list``);
        * ``'token_cnt'``: result of ``self.token_count(text, language)``.
    """
    text = getattr(segment, ftype + '_text')
    doc = {'text': text}

    # Optional fields (POS, tokenized). The attribute is only read after
    # the hasattr check, so segments without POS data do not raise
    # AttributeError.
    pos_attr = ftype + '_pos'
    if hasattr(segment, pos_attr):
        doc['pos'] = getattr(segment, pos_attr)

    op_ftype = 'source' if ftype == 'target' else 'target'
    # Auxiliary field to facilitate language matrix generation
    opposite_languages = TMUtils.str2list(getattr(segment, op_ftype + '_language'))
    doc['target_language'] = [TMUtils.lang2short(opposite_languages[0])]

    doc['token_cnt'] = self.token_count(text, getattr(segment, ftype + '_language'))
    return doc