Beispiel #1
0
    def _segment2doc(self, segment):
        """Stamp the segment's DB date fields and return it as a flat dict.

        Side effects on ``segment``: ``insert_date`` is set to the current
        time if it is falsy, ``check_date`` is set to the formatted
        1970-01-01 date if it is falsy, and ``update_date`` is always
        overwritten with the current time.

        The returned dict has one key per segment attribute. The
        ``industry``, ``type``, ``organization``, ``file_name`` and
        ``domain`` values are passed through ``TMUtils.str2list``; every
        other value is copied as-is.
        """
        timestamp = TMUtils.date2str(datetime.datetime.now())

        # Only fill the dates that have not been set yet.
        if not segment.insert_date:
            segment.insert_date = timestamp
        if not segment.check_date:
            epoch = datetime.datetime(1970, 1, 1)
            segment.check_date = TMUtils.date2str(epoch)
        # The update date is refreshed on every conversion.
        segment.update_date = timestamp

        # Field names grouped by how their value is produced; the groups are
        # applied in this order so the resulting key order stays the same.
        copied_first = (
            'source_id', 'target_id',
            'source_text', 'target_text',
            'source_language', 'target_language',
            'source_metadata', 'target_metadata',
            'metadata', 'tuid',
        )
        listified = ('industry', 'type', 'organization', 'file_name', 'domain')
        copied_last = (
            'tm_creation_date', 'tm_change_date',
            'insert_date', 'update_date', 'check_date',
            'check_version', 'dirty_score', 'username',
        )

        doc = {}
        for name in copied_first:
            doc[name] = getattr(segment, name)
        for name in listified:
            doc[name] = TMUtils.str2list(getattr(segment, name))
        for name in copied_last:
            doc[name] = getattr(segment, name)
        return doc
Beispiel #2
0
  def _segment2doc(self, segment, ftype):
    """Build the dict document for one side of a segment.

    ``ftype`` is used as an attribute-name prefix: the method reads
    ``<ftype>_text``, ``<ftype>_language`` and, when present,
    ``<ftype>_pos`` from ``segment``. The code treats any ``ftype`` other
    than ``'target'`` as the source side when picking the opposite side.

    Returns a dict with the keys ``text``, ``target_language``,
    ``token_cnt`` and, only if the segment has a ``<ftype>_pos``
    attribute, ``pos``.

    Raises ``AttributeError`` if a required (non-optional) attribute is
    missing from ``segment``.
    """
    text = getattr(segment, ftype + '_text')
    doc = {'text': text}

    # Optional fields (POS, tokenized). The attribute is looked up only
    # after the hasattr() check, so a segment without POS data is accepted
    # instead of failing with AttributeError.
    pos_attr = ftype + '_pos'
    if hasattr(segment, pos_attr):
      doc['pos'] = getattr(segment, pos_attr)

    op_ftype = 'source' if ftype == 'target' else 'target'
    # Auxiliary field to facilitate language matrix generation: a
    # one-element list built from the first language of the opposite side.
    # NOTE(review): presumably lang2short yields a short language code --
    # confirm against TMUtils.
    op_languages = TMUtils.str2list(getattr(segment, op_ftype + '_language'))
    doc['target_language'] = [TMUtils.lang2short(op_languages[0])]
    doc['token_cnt'] = self.token_count(text, getattr(segment, ftype + '_language'))
    return doc