Ejemplo n.º 1
0
  def token_count(self, text, lang):
    """Count the space-separated tokens of *text* for language *lang*.

    The language code is cut at the first ``-`` and upper-cased before use.
    A regex preprocessor and a tokenizer are built on first use and cached
    in ``self.regex`` / ``self.tokenizers`` under that code.  If building
    one fails, the language code itself is cached instead; an entry equal to
    the code marks that step as unsupported and it is skipped.

    Returns the number of pieces obtained by splitting the resulting text
    on single spaces.
    """
    lang = lang.split('-')[0].upper()

    # Build and cache the regex preprocessor the first time a language is seen
    if lang not in self.regex:
      try:
        self.regex[lang] = TMRegExpPreprocessor(lang)
        logging.info("Loading Regex for {}".format(lang))
      except Exception:
        logging.info("Unsupported Regex for {} ".format(lang))
        self.regex[lang] = lang  # marker: regex step unavailable

    # Same lazy caching for the tokenizer
    if lang not in self.tokenizers:
      try:
        self.tokenizers[lang] = TMTokenizer(lang)
        logging.info("Loading Tokenizer for {}".format(lang))
      except Exception:
        self.tokenizers[lang] = lang  # marker: tokenizer step unavailable
        logging.info("Unsupported Tokenizer for {}".format(lang))

    regex_pp = self.regex[lang]
    if regex_pp != lang:
      text = TMRegexMatch.simplified_name(regex_pp.process(text))

    tokenizer = self.tokenizers[lang]
    if tokenizer != lang:
      pieces = tokenizer.tokenizer.process(text).split(' ')
    else:
      # split(' ') already yields a single piece when the text has no space
      pieces = text.split(' ')

    return len(pieces)
Ejemplo n.º 2
0
    def transform_apply_regex(self, segment, dic_fuzzy):
        """Patch the target text of a TM match using the source-side differences.

        The target text of ``segment[0]`` is tokenized and regexp-processed,
        and the results are stored in ``dic_fuzzy['tm_tgt_tok']`` and
        ``dic_fuzzy['tm_tgt_re']``.  ``dic_fuzzy['src_tok']`` is then diffed
        against ``dic_fuzzy['tm_src_re']``; every inserted fragment is paired
        (by position) with a deleted fragment and substituted in the
        processed target text.

        Args:
            segment: Sequence whose first item is a mapping with a
                ``'target_text'`` entry.
            dic_fuzzy: Mapping that must already hold ``'src_tok'`` and
                ``'tm_src_re'``; it is updated in place.

        Returns:
            ``segment[0]``, with ``'target_text'`` replaced by the patched
            text when at least one substitution pair exists, otherwise left
            untouched.
        """
        segment = segment[0]
        # Apply regular expression to target
        # NOTE(review): the target text is processed with self.src_lang, not a
        # target language -- confirm this is intended.
        dic_fuzzy['tm_tgt_tok'] = self.pre_process(segment['target_text'],
                                                   self.src_lang, 'tokenizer')
        dic_fuzzy['tm_tgt_re'] = TMRegExpPreprocessor(self.src_lang).process(
            dic_fuzzy['tm_tgt_tok'])

        dmp = diff_match_patch()
        diffs = dmp.diff_main(dic_fuzzy['src_tok'],
                              dic_fuzzy['tm_src_re'])  # Identified differences
        dmp.diff_cleanupSemantic(diffs)

        # op == 1: fragment only in 'tm_src_re'; op == -1: fragment only in 'src_tok'
        find = [text for op, text in diffs if op == 1]
        replace = [text for op, text in diffs if op == -1]

        # zip() stops at the shorter list, so an unequal number of insertions
        # and deletions no longer raises IndexError.  Substitutions are applied
        # on top of each other instead of each one restarting from the
        # unmodified text (which kept only the last replacement).
        patched = dic_fuzzy['tm_tgt_re']
        substituted = False
        for old, new in zip(find, replace):
            if old in patched:
                patched = patched.replace(old, new, 1)
            else:  # Search by each pattern into the sequence
                patched = self._sub_each_part(old, new, patched)
            substituted = True

        if substituted:
            segment['target_text'] = patched
        return segment
Ejemplo n.º 3
0
    def __init__(self, src_lang, tgt_lang):
        """Store the language pair and create a regexp preprocessor for each.

        ``self.re_pp`` maps every language code to the
        ``TMRegExpPreprocessor`` built for it.
        """
        self.src_lang, self.tgt_lang = src_lang, tgt_lang
        # One regexp preprocessor per language, keyed by the language code
        self.re_pp = {}
        for code in (src_lang, tgt_lang):
            self.re_pp[code] = TMRegExpPreprocessor(code)
Ejemplo n.º 4
0
    def apply_regex(self, segment, dic_fuzzy, elastic_ter):
        """Regexp-process the TM source text and score the result.

        The source text of ``segment[0]`` is tokenized and then passed
        through the regexp preprocessor; both versions are stored in
        ``dic_fuzzy`` under ``'tm_src_tok'`` and ``'tm_src_re'``.

        Returns:
            A ``(dic_fuzzy, improve_ter)`` tuple.  ``improve_ter`` is
            ``self.ter_score(dic_fuzzy['src_re'], dic_fuzzy['tm_src_re'])``
            when the regexp step changed the text, and ``2 * elastic_ter``
            otherwise.
        """
        # Tokenize first, then run the regular expressions over the tokens
        tm_src_tok = self.pre_process(segment[0]['source_text'],
                                      self.src_lang, 'tokenizer')
        dic_fuzzy['tm_src_tok'] = tm_src_tok
        tm_src_re = TMRegExpPreprocessor(self.src_lang).process(tm_src_tok)
        dic_fuzzy['tm_src_re'] = tm_src_re

        if tm_src_re != tm_src_tok:
            # The regular expression changed the text: score the new version
            improve_ter = self.ter_score(dic_fuzzy['src_re'], tm_src_re)
        else:
            # Nothing changed, so report a bad TER: the regular expression
            # doesn't improve the match
            improve_ter = 2 * elastic_ter

        return dic_fuzzy, improve_ter
Ejemplo n.º 5
0
    def __init__(self, src_lang, tgt_lang):
        """Store the language pair and create a regexp preprocessor for each.

        ``self.pipe`` holds the regexp names given to every preprocessor and
        ``self.re_pp`` maps each language code to its
        ``TMRegExpPreprocessor``.
        """
        self.src_lang, self.tgt_lang = src_lang, tgt_lang
        # Names of the regular expressions passed to each preprocessor
        self.pipe = [
            'formula', 'datetime', 'bullet', 'munit', 'acronym', 'email',
            'url', 'number'
        ]
        # Initialize regexp preprocessors; each one gets its own copy of the
        # pipe list rather than a shared object
        self.re_pp = {}
        for code in (src_lang, tgt_lang):
            self.re_pp[code] = TMRegExpPreprocessor(code, pipe=list(self.pipe))
Ejemplo n.º 6
0
 def pre_process(self, text, lang, preprocess):
     """Run a single preprocessing step over *text* and return the result.

     Args:
         text: Text to process.
         lang: Language code handed to ``TMTokenizer`` or
             ``TMRegExpPreprocessor``.
         preprocess: ``'tokenizer'`` to tokenize the text, or ``'reg_exp'``
             to apply the regexp preprocessor.

     Returns:
         The output of the selected step.

     Raises:
         ValueError: If *preprocess* names neither supported step.
     """
     if preprocess == 'tokenizer':
         return TMTokenizer(lang).tokenizer.process(text)
     if preprocess == 'reg_exp':
         return TMRegExpPreprocessor(lang).process(text)
     # Fail with a clear message rather than an UnboundLocalError on return
     raise ValueError(
         "Unknown preprocess step {!r}; expected 'tokenizer' or 'reg_exp'"
         .format(preprocess))
Ejemplo n.º 7
0
class TagsResource(Resource):
    """REST resource to list, create/update and delete tags."""
    decorators = [PermissionChecker(user_permission)]
    regex_pp = TMRegExpPreprocessor()
    """
  @api {get} /tags/<tag_id> List available tags or get specific tag details
  @apiVersion 1.0.0
  @apiName Get
  @apiGroup Tags
  @apiUse Header
  @apiPermission user

  @apiParam {String} [tag]

  @apiError {String} 404 Tag doesn't exist
  @apiError {String} 403 Insufficient permissions

  """
    def get(self, tag_id=None):
        """Return one tag (when *tag_id* is given) or ``{'tags': [...]}``.

        Aborts with 404 when the tag is not found or is removed by the
        scope filter.
        """
        tags = []
        if tag_id:
            tag = Tags.query.get(tag_id)
            if tag:
                tags = [tag.to_dict()]
            else:
                # 'message' (not 'mesage') is the key used by the other
                # abort() calls of this class
                abort(404, message="Tag {} doesn't exist".format(tag_id))
        else:
            tags = [tag.to_dict() for tag in Tags.query.all()]
        # Filter scopes according to permissions
        tags = UserScopeChecker.filter_domains(tags, key_fn=lambda t: t["id"])
        if tag_id:
            if not tags:
                # The tag was filtered out: report it as missing
                abort(404, message="Tag {} doesn't exist".format(tag_id))
            return tags[0]
        # List of all tags left after filtering
        return {'tags': tags}

    """
  @api {post} /tags/:id Update tag
  @apiVersion 1.0.0
  @apiName Post
  @apiGroup Tags
  @apiUse Header
  @apiPermission admin

  @apiParam {String} id
  @apiParam {String} name
  @apiParam {String} type

  @apiError {String} 403 Insufficient permissions

  """

    @admin_permission.require(http_exception=403)
    def post(self, tag_id):
        """Update the tag *tag_id*, or create it when it does not exist.

        Any exception raised while saving is reported as a 500 carrying the
        exception text.
        """
        args = self._reqparse()
        tag = Tags.query.get(tag_id)

        try:
            if tag:
                tag.update(**args)
                CRUD.update()
            else:
                tag = Tags(tag_id, **args)
                CRUD.add(tag)
        except Exception as e:
            abort(500, message=str(e))
        return {"message": "Tag {} added/updated successfully".format(tag_id)}

    def _reqparse(self):
        """Parse the ``name`` and ``type`` arguments of the request."""
        parser = RequestParser(bundle_errors=True)
        parser.add_argument(name='name', help="Tag name")
        parser.add_argument(name='type', help="Tag type")

        return parser.parse_args()

    """
  @api {delete} /tags/:id Delete tag
  @apiVersion 1.0.0
  @apiName Delete
  @apiGroup Tags
  @apiUse Header
  @apiPermission admin

  @apiParam {String} tag

  @apiError {String} 403 Insufficient permissions
  @apiError {String} 404 Tag doesn't exist

  """

    @admin_permission.require(http_exception=403)
    def delete(self, tag_id):
        """Delete the tag *tag_id*; abort with 404 when it does not exist."""
        tag = Tags.query.get(tag_id)
        if tag:
            try:
                CRUD.delete(tag)
            except Exception as e:
                abort(500, message=str(e))
        else:
            abort(404, message="Tag {} doesn't exist".format(tag_id))
        return {"message": "Tag {} deleted successfully".format(tag_id)}
Ejemplo n.º 8
0
class SettingsResource(Resource):
    """REST resource to read and update the current user's regex settings."""
    decorators = [PermissionChecker(user_permission)]
    regex_pp = TMRegExpPreprocessor()
    """
  @api {get} /settings Get user settings
  @apiVersion 1.0.0
  @apiName Get
  @apiGroup Settings
  @apiUse Header
  @apiPermission user


  """
    def get(self):
        """Return ``{'settings': ...}`` for the current user.

        The value is the ``regex`` field of the user's first settings
        record, or an empty list when the user has no settings.
        """
        user = Users.query.get(current_identity.id)
        if not user:
            abort(500,
                  message="User {} doesn't exist".format(current_identity.id))
        out = {'settings': []}
        if user.settings:
            out['settings'] = user.settings[0].regex
        return out

    """
  @api {put} /settings Update user settings
  @apiVersion 1.0.0
  @apiName Put
  @apiGroup Settings
  @apiUse Header
  @apiPermission user
  """

    def put(self):
        """Store the ``regex`` argument in the current user's settings.

        The value ``none`` (any letter case) clears the setting.  Any other
        value is split on commas and checked with ``validate_pipe``; an
        invalid value aborts with 400.
        """
        args = self._put_reqparse()
        user = Users.query.get(current_identity.id)
        if not user:
            abort(500,
                  message="User {} doesn't exist".format(current_identity.id))

        # Validate before creating or modifying any settings record, so a
        # rejected request leaves the stored state untouched
        if args.regex and args.regex.lower() == 'none':
            new_regex = ''
        else:
            new_regex = args.regex
            if not self.regex_pp.validate_pipe(new_regex.split(',')):
                abort(
                    400,
                    message=
                    "Invalid regular expression(s). Possible values (joined with comma) are: {} "
                    .format(list(TMRegExpPreprocessor.regexp.keys())))

        settings = user.settings
        if settings:
            settings = settings[0]

        if not settings:
            settings = UserSettings(current_identity.id)
            CRUD.add(settings)
        settings.regex = new_regex

        CRUD.update()

        return settings.to_dict()

    def _put_reqparse(self):
        """Parse the required ``regex`` argument of a PUT request."""
        parser = RequestParser(bundle_errors=True)
        parser.add_argument(
            name='regex',
            help=
            "List (separated with comma) of regular expression names to apply",
            required=True)

        return parser.parse_args()