Example #1
# Assumed import paths, based on the tesserae.db.entities references in
# these examples.
from collections.abc import Sequence

from tesserae.db.entities import Entity, Feature


def create_features(db_features, text, language, feature, feature_list):
    """Create feature entities and register frequencies.


    """
    if isinstance(text, Entity):
        text = text.id
    # Re-key the cached features by token for constant-time lookup
    db_features = {f.token: f for f in db_features}

    out_features = []
    for f in feature_list:
        if isinstance(f, Sequence) and not isinstance(f, str):
            if len(f) < 1:
                out_features.append([])
                continue
            # Skip features whose first token looks like a metadata tag
            if f[0][0] == '<':
                continue
            feature_group = []
            for sub_f in f:
                if sub_f in db_features:
                    sub_f = db_features[sub_f]
                    try:
                        sub_f.frequencies[str(text)] += 1
                    except KeyError:
                        sub_f.frequencies[str(text)] = 1
                    feature_group.append(sub_f)
                else:
                    sub_f = Feature(feature=feature,
                                    token=sub_f,
                                    language=language,
                                    index=len(db_features),
                                    frequencies={str(text): 1})
                    db_features[sub_f.token] = sub_f
                    feature_group.append(sub_f)
            out_features.append(feature_group)

        else:
            # Skip tokens that look like metadata tags
            if f[0] == '<':
                continue
            if f in db_features:
                f = db_features[f]
                try:
                    f.frequencies[str(text)] += 1
                except KeyError:
                    f.frequencies[str(text)] = 1
                out_features.append(f)
            else:
                f = Feature(feature=feature,
                            token=f,
                            language=language,
                            index=len(db_features),
                            frequencies={str(text): 1})
                db_features[f.token] = f
                out_features.append(f)

    return out_features, feature
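
A minimal usage sketch (the variable values are illustrative, not taken from
the library): given Feature entities previously fetched for the 'lemmata'
feature type and a Text entity, the call below registers per-text frequencies;
an inner list carries the candidate lemmata of an ambiguous form.

# Hypothetical inputs: cached_features would come from a database query and
# text is a tesserae.db.entities.Text.
out_features, feature_type = create_features(
    cached_features, text, 'latin', 'lemmata',
    [['arma', 'armo'], ['vir']])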
Example #2
from tesserae.db.entities import Feature


def register_features(conn, language, feature_type, candidates):
    """Register Feature entities in the database

    Parameters
    ----------
    conn : TessMongoConnection
        Open connection to the database
    language : str
        Language of the Feature entities to insert into database
    feature_type : str
        Feature type of the Feature entities to be inserted into the
        database
    candidates : set[str]
        Instances of a feature type that might need to be added to the database
    """
    already_registered = {
        f.token
        for f in conn.find(
            Feature.collection, language=language, feature=feature_type)
    }
    to_be_registered = [c for c in candidates if c not in already_registered]
    # Continue index numbering where the already-registered features left off
    start = len(already_registered)
    conn.insert([
        Feature(language=language, feature=feature_type, token=c, index=i)
        for i, c in enumerate(to_be_registered, start=start)
    ])
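
For illustration, a hedged call; the candidate set would normally be produced
by a featurizer pass over a new text, and the values below are made up.

# Insert any lemma strings not already registered for Latin.
register_features(conn, 'latin', 'lemmata', {'arma', 'vir', 'cano'})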
Example #3
from tesserae.db.entities import Feature


def _calculate_new_and_for_update_features(text, feature, db_feature_cache,
                                           tokens, form_oid_to_raw_features):
    """Compute new features for insert and old features for update

    In processing the raw features of the given text, feature frequency
    information will be computed.

    Parameters
    ----------
    text : tesserae.db.entities.Text
        Text whose feature frequencies are being analyzed
    feature : str
        Feature type of interest
    db_feature_cache : dict[str, tesserae.db.entities.Feature]
        Mapping between a feature's token and its corresponding Feature entity
    tokens : list of tesserae.db.entities.Token
        All Token entities associated with the text
    form_oid_to_raw_features : dict[ObjectId, list[str]]
        Mapping between form ObjectId and list of strings extracted as raw
        features for that form

    Returns
    -------
    f_token_to_feature_for_insert : dict[str, tesserae.db.entities.Feature]
        Newly created Feature entities, keyed by token, to be inserted
    f_token_to_feature_for_update : dict[str, tesserae.db.entities.Feature]
        Feature entities already in the database, keyed by token, whose
        frequencies need to be updated
    """
    f_token_to_feature_for_insert = {}
    f_token_to_feature_for_update = {}
    text_id_str = str(text.id)
    language = text.language
    for token in tokens:
        form_oid = token.features['form']
        f_tokens = form_oid_to_raw_features[form_oid]
        for f_token in f_tokens:
            # Ignore empty feature strings
            if f_token:
                # Unseen feature: stage a new entity for insertion
                if f_token not in db_feature_cache:
                    if f_token in f_token_to_feature_for_insert:
                        cur_feature = f_token_to_feature_for_insert[f_token]
                        cur_feature.frequencies[text_id_str] += 1
                    else:
                        # The next index counts features already in the
                        # database plus those staged for insertion
                        index = len(db_feature_cache) + \
                            len(f_token_to_feature_for_insert)
                        f_token_to_feature_for_insert[f_token] = Feature(
                            feature=feature,
                            token=f_token,
                            language=language,
                            index=index,
                            frequencies={text_id_str: 1})
                # Known feature: stage the cached entity for a frequency
                # update
                else:
                    if f_token not in f_token_to_feature_for_update:
                        f_token_to_feature_for_update[f_token] = \
                            db_feature_cache[f_token]
                    cur_feature = f_token_to_feature_for_update[f_token]
                    frequencies = cur_feature.frequencies
                    if text_id_str in frequencies:
                        frequencies[text_id_str] += 1
                    else:
                        frequencies[text_id_str] = 1
    return f_token_to_feature_for_insert, f_token_to_feature_for_update
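
A sketch of how the two return values might be consumed, assuming a connection
object like the one in Example #2 (conn.insert appears there; conn.update is
an assumption about the same interface):

for_insert, for_update = _calculate_new_and_for_update_features(
    text, 'lemmata', db_feature_cache, tokens, form_oid_to_raw_features)
# Persist the new entities, then push frequency updates for existing ones.
conn.insert(list(for_insert.values()))   # insert is shown in Example #2
conn.update(list(for_update.values()))   # update method is assumed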
Example #4
    def tokenize(self, raw, text=None):
        """Normalize and featurize the words in a string.

        Tokens consist of the raw string, its normalized form, and features
        related to the words under study. This computes all of the relevant
        data and tracks token frequencies in one pass.

        Parameters
        ----------
        raw : str or list of str
            The string(s) to process. If a list, assumes that the string
            members of the list have already been split as intended (e.g.
            list elements were split on whitespace).
        text : tesserae.Text, optional
            Text metadata for associating tokens and frequencies with a
            particular text.

        Returns
        -------
        tokens : list of tesserae.db.Token
            The token entities to insert into the database.
        tags : list of str
            Metadata about the source text for unit bookkeeping.
        features : list of tesserae.db.Feature
            Features associated with the tokens to be inserted into the
            database.
        """
        # Eliminate any lines that don't begin with a metadata tag
        raw = '\n'.join([line for line in raw.split('\n')
                         if line.strip().startswith('<') and '>' in line]) + '\n'
        # Compute the normalized forms of the input tokens, splitting the
        # result based on a regex pattern and discarding None values.
        normalized, tags = self.normalize(raw)
        tags = [t[:-1].split()[-1] for t in tags]

        # Compute the display version of each token by stripping the metadata
        # tags and converting newlines to their symbolic form.
        raw = re.sub(r'[<][^>]+[>]\s+', r'', raw, flags=re.UNICODE)
        raw = re.sub(r'/', r' ', raw, flags=re.UNICODE)
        raw = re.sub(r'[\n]', r' / ', raw, flags=re.UNICODE)

        # Split the display form into independent strings for each token,
        # discarding any None values.
        display = re.split(self.split_pattern, raw, flags=re.UNICODE)
        display = [t for t in display if t]

        # Compute the language-specific features of each token and add the
        # normalized forms as additional results.
        featurized = self.featurize(normalized)
        featurized['form'] = normalized

        # Get the text id from the metadata if it was passed in
        try:
            text_id = text.id
        except AttributeError:
            text_id = None

        # Get the token language from the metadata if it was passed in
        try:
            language = text.language
        except AttributeError:
            language = None

        tokens = []

        # Convert all computed features into entities, discarding duplicates.
        db_features = _get_db_features_by_type(self.connection, language,
                                               featurized.keys())
        results = [create_features(db_features[ft], text_id, language, ft,
                                   featurized[ft]) for ft in featurized.keys()]

        for feature_list, feature in results:
            featurized[feature] = feature_list

        # Prep the token objects
        norm_i = 0

        try:
            punctuation = self.connection.find(
                'features', feature='punctuation')[0]
        except IndexError:
            punctuation = Feature(feature='punctuation', token='', index=-1)

        for i, d in enumerate(display):
            if self.word_regex.search(d):
                features = {key: val[norm_i]
                            for key, val in featurized.items()}
                norm_i += 1
            # Digit-only display tokens are assigned the placeholder
            # punctuation feature
            elif re.search(r'^[\d]+$', d, flags=re.UNICODE):
                features = {key: punctuation if key == 'form' else [punctuation]
                            for key in featurized.keys()}
            else:
                features = None

            t = Token(text=text, index=i, display=d, features=features)
            tokens.append(t)

        # Collect the unique Feature entities across all feature types
        features = set()
        for val in featurized.values():
            if isinstance(val[0], list):
                for v in val:
                    features.update(v)
            else:
                features.update(val)

        return tokens, tags, list(features)
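
A hedged end-to-end sketch; LatinTokenizer, the connection, and the .tess file
path are assumptions about the surrounding library, not taken from the code
above.

# Hypothetical usage with a concrete tokenizer subclass.
tokenizer = LatinTokenizer(connection)
with open('texts/la/vergil.aeneid.tess', encoding='utf-8') as f:
    raw = f.read()
tokens, tags, features = tokenizer.tokenize(raw, text=text_entity)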