import collections.abc
import re

from tesserae.db.entities import Entity, Feature, Token


def create_features(db_features, text, language, feature, feature_list):
    """Create Feature entities and record per-text frequencies.

    Raw feature strings are looked up among the existing entities;
    previously seen features have their frequency count for the given
    text incremented, while unseen strings become new Feature entities
    with the next available index.

    Parameters
    ----------
    db_features : list of tesserae.db.entities.Feature
        Feature entities of this type already in the database
    text : tesserae.db.entities.Text or bson.objectid.ObjectId
        The text (or its id) with which frequencies are associated
    language : str
        Language of the features
    feature : str
        Feature type being processed
    feature_list : list
        Raw feature strings, or lists of strings for multi-valued
        features, in token order

    Returns
    -------
    out_features : list
        Feature entities (or lists of entities) mirroring the structure
        of ``feature_list``
    feature : str
        The feature type, passed through for bookkeeping
    """
    if isinstance(text, Entity):
        text = text.id
    db_features = {f.token: f for f in db_features}
    out_features = []
    for f in feature_list:
        if isinstance(f, collections.abc.Sequence) and not isinstance(f, str):
            if len(f) < 1:
                out_features.append([])
                continue
            # Skip metadata tags that slipped through as features.
            if f[0][0] == '<':
                continue
            feature_group = []
            for sub_f in f:
                if sub_f in db_features:
                    sub_f = db_features[sub_f]
                    try:
                        sub_f.frequencies[str(text)] += 1
                    except KeyError:
                        sub_f.frequencies[str(text)] = 1
                else:
                    sub_f = Feature(feature=feature,
                                    token=sub_f,
                                    language=language,
                                    index=len(db_features),
                                    frequencies={str(text): 1})
                    db_features[sub_f.token] = sub_f
                feature_group.append(sub_f)
            out_features.append(feature_group)
        else:
            if f[0] == '<':
                continue
            if f in db_features:
                f = db_features[f]
                try:
                    f.frequencies[str(text)] += 1
                except KeyError:
                    f.frequencies[str(text)] = 1
            else:
                f = Feature(feature=feature,
                            token=f,
                            language=language,
                            index=len(db_features),
                            frequencies={str(text): 1})
                db_features[f.token] = f
            out_features.append(f)
    return out_features, feature
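# A minimal usage sketch for create_features (illustrative only: `conn`,
# `text`, and the raw lemma lists below are assumptions, not part of this
# module):
#
#   db_features = _get_db_features_by_type(conn, 'latin', ['lemmata'])
#   out_features, ftype = create_features(
#       db_features['lemmata'], text, 'latin', 'lemmata',
#       [['arma'], ['vir', 'virus'], ['cano']])
#
# Each raw string comes back as a Feature entity, reused from the database
# cache when present and newly constructed otherwise, with the frequency
# count for str(text.id) incremented in place.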
def register_features(conn, language, feature_type, candidates):
    """Register features in the database.

    Parameters
    ----------
    conn : TessMongoConnection
        Connection to the database holding Feature entities
    language : str
        Language of the Feature entities to insert into the database
    feature_type : str
        Type of feature that the Feature entities to be inserted into
        the database are supposed to be
    candidates : set[str]
        Instances of a feature type that might need to be added to the
        database
    """
    already_registered = {
        f.token
        for f in conn.find(
            Feature.collection, language=language, feature=feature_type)
    }
    to_be_registered = [c for c in candidates if c not in already_registered]
    # New features continue the index sequence after those already stored.
    conn.insert([
        Feature(language=language, feature=feature_type, token=c, index=i)
        for i, c in enumerate(to_be_registered,
                              start=len(already_registered))
    ])
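# A minimal usage sketch for register_features (illustrative only; the
# TessMongoConnection `conn` is an assumption). Only candidates that are
# not yet stored get inserted, so the call is safe to repeat:
#
#   register_features(conn, 'latin', 'lemmata', {'arma', 'vir', 'cano'})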
def _calculate_new_and_for_update_features(text, feature, db_feature_cache,
                                           tokens, form_oid_to_raw_features):
    """Compute new features to insert and old features to update.

    In processing the raw features of the given text, feature frequency
    information is computed.

    Parameters
    ----------
    text : tesserae.db.entities.Text
        Text whose feature frequencies are being analyzed
    feature : str
        Feature type of interest
    db_feature_cache : dict[str, tesserae.db.entities.Feature]
        Mapping between a feature's token and its corresponding Feature
        entity
    tokens : list of tesserae.db.entities.Token
        All Token entities associated with the text
    form_oid_to_raw_features : dict[ObjectId, list[str]]
        Mapping between a form's ObjectId and the list of strings
        extracted as raw features for that form

    Returns
    -------
    f_token_to_feature_for_insert : dict[str, tesserae.db.entities.Feature]
        Newly created Feature entities, keyed by token, to be inserted
        into the database
    f_token_to_feature_for_update : dict[str, tesserae.db.entities.Feature]
        Existing Feature entities, keyed by token, whose frequencies were
        modified and need to be updated in the database
    """
    f_token_to_feature_for_insert = {}
    f_token_to_feature_for_update = {}
    text_id_str = str(text.id)
    language = text.language
    for token in tokens:
        form_oid = token.features['form']
        f_tokens = form_oid_to_raw_features[form_oid]
        for f_token in f_tokens:
            # Ignore empty raw feature strings.
            if not f_token:
                continue
            if f_token not in db_feature_cache:
                if f_token in f_token_to_feature_for_insert:
                    cur_feature = f_token_to_feature_for_insert[f_token]
                    cur_feature.frequencies[text_id_str] += 1
                else:
                    # New features continue the index sequence after both
                    # the cached and the pending-insert features.
                    index = len(db_feature_cache) + \
                        len(f_token_to_feature_for_insert)
                    f_token_to_feature_for_insert[f_token] = Feature(
                        feature=feature,
                        token=f_token,
                        language=language,
                        index=index,
                        frequencies={text_id_str: 1})
            else:
                if f_token not in f_token_to_feature_for_update:
                    f_token_to_feature_for_update[f_token] = \
                        db_feature_cache[f_token]
                cur_feature = f_token_to_feature_for_update[f_token]
                frequencies = cur_feature.frequencies
                if text_id_str in frequencies:
                    frequencies[text_id_str] += 1
                else:
                    frequencies[text_id_str] = 1
    return f_token_to_feature_for_insert, f_token_to_feature_for_update
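# A minimal usage sketch (illustrative only; `conn`, `db_feature_cache`,
# and `form_oid_to_raw_features` are assumed to have been built
# elsewhere). The two returned dictionaries split the work into a bulk
# insert of brand-new Feature entities and a bulk update of existing
# ones:
#
#   tokens = conn.find(Token.collection, text=text.id)
#   to_insert, to_update = _calculate_new_and_for_update_features(
#       text, 'lemmata', db_feature_cache, tokens,
#       form_oid_to_raw_features)
#   conn.insert(list(to_insert.values()))
#   # assuming the connection exposes an `update` counterpart to `insert`
#   conn.update(list(to_update.values()))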
def tokenize(self, raw, text=None):
    """Normalize and featurize the words in a string.

    Tokens are composed of the raw string, its normalized form, and
    features related to the words under study. This computes all of the
    relevant data and tracks token frequencies in one shot.

    Parameters
    ----------
    raw : str or list of str
        The string(s) to process. If a list, assumes that the string
        members of the list have already been split as intended (e.g.,
        list elements were split on whitespace).
    text : tesserae.db.entities.Text, optional
        Text metadata for associating tokens and frequencies with a
        particular text.

    Returns
    -------
    tokens : list of tesserae.db.entities.Token
        The token entities to insert into the database.
    tags : list of str
        Metadata about the source text for unit bookkeeping.
    features : list of tesserae.db.entities.Feature
        Features associated with the tokens to be inserted into the
        database.
    """
    # Eliminate any lines that don't begin with a metadata tag.
    raw = '\n'.join([
        line for line in raw.split('\n')
        if line.strip().startswith('<') and '>' in line
    ]) + '\n'

    # Compute the normalized forms of the input tokens, splitting the
    # result based on a regex pattern and discarding None values.
    normalized, tags = self.normalize(raw)
    tags = [t[:-1].split()[-1] for t in tags]

    # Compute the display version of each token by stripping the metadata
    # tags and converting newlines to their symbolic form.
    raw = re.sub(r'[<][^>]+[>]\s+', r'', raw, flags=re.UNICODE)
    raw = re.sub(r'/', r' ', raw, flags=re.UNICODE)
    raw = re.sub(r'[\n]', r' / ', raw, flags=re.UNICODE)

    # Split the display form into independent strings for each token,
    # discarding any empty strings.
    display = re.split(self.split_pattern, raw, flags=re.UNICODE)
    display = [t for t in display if t]

    # Compute the language-specific features of each token and add the
    # normalized forms as additional results.
    featurized = self.featurize(normalized)
    featurized['form'] = normalized

    # Get the text id from the metadata if it was passed in.
    try:
        text_id = text.id
    except AttributeError:
        text_id = None

    # Get the token language from the metadata if it was passed in.
    try:
        language = text.language
    except AttributeError:
        language = None

    tokens = []

    # Convert all computed features into entities, discarding duplicates.
    db_features = _get_db_features_by_type(self.connection, language,
                                           featurized.keys())
    results = [
        create_features(db_features[ft], text_id, language, ft,
                        featurized[ft]) for ft in featurized.keys()
    ]
    for feature_list, feature in results:
        featurized[feature] = feature_list

    # Prep the token objects.
    norm_i = 0
    try:
        punctuation = self.connection.find('features',
                                           feature='punctuation')[0]
    except IndexError:
        punctuation = Feature(feature='punctuation', token='', index=-1)
    for i, d in enumerate(display):
        if self.word_regex.search(d):
            features = {key: val[norm_i] for key, val in featurized.items()}
            norm_i += 1
        elif re.search(r'^[\d]+$', d, flags=re.UNICODE):
            # Tokens consisting only of digits are treated as punctuation.
            features = {
                key: punctuation if key == 'form' else [punctuation]
                for key in featurized.keys()
            }
        else:
            features = None
        t = Token(text=text, index=i, display=d, features=features)
        tokens.append(t)

    # Collect the distinct Feature entities touched by this text.
    features = set()
    for val in featurized.values():
        if isinstance(val[0], list):
            for v in val:
                features.update(v)
        else:
            features.update(val)

    return tokens, tags, list(features)
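# A minimal usage sketch for tokenize (illustrative only; the
# LatinTokenizer name and the .tess-style tagged input line are
# assumptions):
#
#   tokenizer = LatinTokenizer(connection)
#   raw = '<verg. aen. 1.1>\tArma virumque cano, Troiae qui primus ab oris\n'
#   tokens, tags, features = tokenizer.tokenize(raw, text=text)
#   connection.insert(features)
#   connection.insert(tokens)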