def operate(self, instance):
    """Attach the four geographic indicator features to ``instance``.

    The features are derived from
    ``instance.attributes["entity_inferred_name"]`` and stored as ``Feature``
    objects in ``instance.feature_groups[self.name]``, keyed by feature name.
    Any group already stored under ``self.name`` is replaced.

    Args:
        instance: object exposing ``attributes`` and ``feature_groups``
            mappings and a ``feature_count()`` method.

    Returns:
        None. When ``self.force`` is falsy and a group named ``self.name``
        already exists, the instance is left untouched.
    """
    # ``in`` replaces the deprecated ``dict.has_key`` (gone in Python 3).
    if not self.force and self.name in instance.feature_groups:
        return

    # Register the (empty) group first so it exists even if a helper below
    # raises, exactly as before.
    group = instance.feature_groups[self.name] = {}

    s = instance.attributes["entity_inferred_name"]
    tokens = tokenize(normalize_no_lower(s))

    # NOTE: the "FEAUTURE" misspelling is part of the stored feature names;
    # renaming it would change the keys, so it is kept as-is.
    name = 'GEO_FEAUTURE_geo_inferred_text_has_state'
    group[name] = Feature(
        name,
        geo_inferred_text_has_state(tokens, self.full, self.full_upper,
                                    self.abbr))

    name = 'GEO_FEAUTURE_geo_inferred_text_has_county'
    group[name] = Feature(name, geo_inferred_text_has_county(s))

    name = 'GEO_FEAUTURE_geo_inferred_text_has_city'
    group[name] = Feature(
        name, geo_inferred_text_has_city(s, self.cities, self.cities_upper))

    name = 'GEO_FEAUTURE_geo_inferred_text_ends_with_state'
    group[name] = Feature(
        name, geo_inferred_text_ends_with_state(tokens, self.abbr, self.full))

    # Lazy logging arguments: the message is only formatted when DEBUG is
    # enabled, so a non-integer id can no longer break feature extraction.
    logging.debug("Feature count %d for entity id: %d after %s",
                  instance.feature_count(), instance.attributes["id"],
                  self.name)
def label_row(self, row, column_indices, table_offset, congress, chamber,
              document_type, number, sponsor_indices):
    """Score one table row and store it as a candidate earmark.

    An instance is built with ``self.get_instance_from_row``, scored with
    ``self.model.decision_function`` and inserted into ``candidate_earmarks``
    together with the state returned by ``self.geo_coder.get_state`` for the
    instance's ``entity_text``. Every sponsor token recognised for
    ``congress`` is additionally inserted into ``sponsors``, linked to the
    new row id. The transaction is committed on ``self.conn`` at the end.

    Args:
        row: source row; its ``offset`` and ``length`` attributes are stored.
        column_indices: forwarded to ``self.get_instance_from_row``.
        table_offset: added to ``row.offset`` to form the stored row offset.
        congress: stored as-is and used as the key into
            ``self.sponsor_coder.sponsors``.
        chamber: stored as-is.
        document_type: stored as-is.
        number: stored as-is.
        sponsor_indices: positions of the sponsor cells within the
            "|"-separated ``entity_text``. An empty or ``None`` value means
            no sponsors are extracted.

    Returns:
        None.
    """
    instance = self.get_instance_from_row(row, column_indices)
    # Only the feature matrix is needed; the other two results are ignored.
    X, _, _ = pipe.instances_to_matrix(
        [instance], feature_space=self.feature_space, dense=False)
    scores = self.model.decision_function(X)

    fields = [
        'congress', 'chamber', 'document_type', 'number', 'row',
        'row_offset', 'row_length', 'score', 'state', 'sponsors'
    ]
    cmd = ("insert into candidate_earmarks (" + ", ".join(fields) +
           ") values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) returning id")

    attributes = instance.attributes
    entity_text = attributes['entity_text']
    state = self.geo_coder.get_state(entity_text)

    cur = self.conn.cursor()
    try:
        # Always defined, so the join below works when no indices are given.
        sponsors = []
        if sponsor_indices:
            print(sponsor_indices)
            # The text does not change inside the loop; split it once.
            cells = entity_text.split("|")
            for index in sponsor_indices:
                try:
                    sponsor_cell = cells[index]
                    sponsors_in_cell = string_functions.tokenize(
                        string_functions.normalize_no_lower(sponsor_cell))
                    for sic in sponsors_in_cell:
                        if sic in self.sponsor_coder.sponsors[congress]:
                            sponsors.append(sic)
                except Exception:
                    # Best effort: a bad index or unknown congress must not
                    # prevent the row itself from being stored.
                    print("Index: %d" % index)
                    print(len(cells))
                    print(entity_text)
                    logging.exception("SCREW UP")

        # The joined sponsor list is cut to 1024 characters before storing.
        sponsors_string = "|".join(sponsors)[:1024]
        cur.execute(cmd, (congress, chamber, document_type, number,
                          entity_text, row.offset + table_offset, row.length,
                          scores[0], state, sponsors_string))
        curr_id = cur.fetchone()[0]
        for sponsor in sponsors:
            cur.execute(
                'insert into sponsors (candidate_earmark_id, sponsor) values (%s, %s)',
                (curr_id, sponsor))
        self.conn.commit()
    finally:
        # Release the cursor whether or not the inserts succeeded.
        cur.close()
def operate(self, instance):
    """Add one indicator feature per allowed token of the inferred name.

    ``instance.attributes["entity_inferred_name"]`` is normalised and
    tokenised. Every token that is not in ``self.forbidden`` becomes a
    ``Feature`` with value 1, named ``self.feature_prefix`` plus the
    lower-cased token, stored in ``instance.feature_groups[self.name]``.
    Any group already stored under ``self.name`` is replaced.

    Args:
        instance: object exposing ``attributes`` and ``feature_groups``
            mappings and a ``feature_count()`` method.

    Returns:
        None. When ``self.force`` is falsy and a group named ``self.name``
        already exists, the instance is left untouched.
    """
    # ``in`` replaces the deprecated ``dict.has_key`` (gone in Python 3).
    if not self.force and self.name in instance.feature_groups:
        return

    group = instance.feature_groups[self.name] = {}
    tokens = tokenize(
        normalize_no_lower(instance.attributes["entity_inferred_name"]))

    for token in tokens:
        # The forbidden check uses the token as produced by the tokenizer;
        # only the feature name is lower-cased.
        if token in self.forbidden:
            continue
        feature_name = self.feature_prefix + token.lower()
        group[feature_name] = Feature(feature_name, 1)

    # Lazy logging arguments: the message is only formatted when DEBUG is
    # enabled, so a non-integer id can no longer break feature extraction.
    logging.debug("Feature count %d for entity id: %d after %s",
                  instance.feature_count(), instance.attributes["id"],
                  self.name)
def operate(self, instance):
    """Attach the four geographic indicator features to ``instance``.

    The features are derived from
    ``instance.attributes["entity_inferred_name"]`` and stored as ``Feature``
    objects in ``instance.feature_groups[self.name]``, keyed by feature name.
    Any group already stored under ``self.name`` is replaced.

    Args:
        instance: object exposing ``attributes`` and ``feature_groups``
            mappings and a ``feature_count()`` method.

    Returns:
        None. When ``self.force`` is falsy and a group named ``self.name``
        already exists, the instance is left untouched.
    """
    # ``in`` replaces the deprecated ``dict.has_key`` (gone in Python 3).
    if not self.force and self.name in instance.feature_groups:
        return

    # Register the (empty) group first so it exists even if a helper below
    # raises, exactly as before.
    group = instance.feature_groups[self.name] = {}

    s = instance.attributes["entity_inferred_name"]
    tokens = tokenize(normalize_no_lower(s))

    # NOTE: the "FEAUTURE" misspelling is part of the stored feature names;
    # renaming it would change the keys, so it is kept as-is.
    name = 'GEO_FEAUTURE_geo_inferred_text_has_state'
    group[name] = Feature(
        name,
        geo_inferred_text_has_state(tokens, self.full, self.full_upper,
                                    self.abbr))

    name = 'GEO_FEAUTURE_geo_inferred_text_has_county'
    group[name] = Feature(name, geo_inferred_text_has_county(s))

    name = 'GEO_FEAUTURE_geo_inferred_text_has_city'
    group[name] = Feature(
        name, geo_inferred_text_has_city(s, self.cities, self.cities_upper))

    name = 'GEO_FEAUTURE_geo_inferred_text_ends_with_state'
    group[name] = Feature(
        name, geo_inferred_text_ends_with_state(tokens, self.abbr, self.full))

    # Lazy logging arguments: the message is only formatted when DEBUG is
    # enabled, so a non-integer id can no longer break feature extraction.
    logging.debug("Feature count %d for entity id: %d after %s",
                  instance.feature_count(), instance.attributes["id"],
                  self.name)
def operate(self, instance):
    """Add one indicator feature per allowed token of the inferred name.

    ``instance.attributes["entity_inferred_name"]`` is normalised and
    tokenised. Every token that is not in ``self.forbidden`` becomes a
    ``Feature`` with value 1, named ``self.feature_prefix`` plus the
    lower-cased token, stored in ``instance.feature_groups[self.name]``.
    Any group already stored under ``self.name`` is replaced.

    Args:
        instance: object exposing ``attributes`` and ``feature_groups``
            mappings and a ``feature_count()`` method.

    Returns:
        None. When ``self.force`` is falsy and a group named ``self.name``
        already exists, the instance is left untouched.
    """
    # ``in`` replaces the deprecated ``dict.has_key`` (gone in Python 3).
    if not self.force and self.name in instance.feature_groups:
        return

    group = instance.feature_groups[self.name] = {}
    tokens = tokenize(
        normalize_no_lower(instance.attributes["entity_inferred_name"]))

    for token in tokens:
        # The forbidden check uses the token as produced by the tokenizer;
        # only the feature name is lower-cased.
        if token in self.forbidden:
            continue
        feature_name = self.feature_prefix + token.lower()
        group[feature_name] = Feature(feature_name, 1)

    # Lazy logging arguments: the message is only formatted when DEBUG is
    # enabled, so a non-integer id can no longer break feature extraction.
    logging.debug("Feature count %d for entity id: %d after %s",
                  instance.feature_count(), instance.attributes["id"],
                  self.name)
def label_row(self, row, column_indices, table_offset, congress, chamber,
              document_type, number, sponsor_indices):
    """Score one table row and store it as a candidate earmark.

    An instance is built with ``self.get_instance_from_row``, scored with
    ``self.model.decision_function`` and inserted into ``candidate_earmarks``
    together with the state returned by ``self.geo_coder.get_state`` for the
    instance's ``entity_text``. Every sponsor token recognised for
    ``congress`` is additionally inserted into ``sponsors``, linked to the
    new row id. The transaction is committed on ``self.conn`` at the end.

    Args:
        row: source row; its ``offset`` and ``length`` attributes are stored.
        column_indices: forwarded to ``self.get_instance_from_row``.
        table_offset: added to ``row.offset`` to form the stored row offset.
        congress: stored as-is and used as the key into
            ``self.sponsor_coder.sponsors``.
        chamber: stored as-is.
        document_type: stored as-is.
        number: stored as-is.
        sponsor_indices: positions of the sponsor cells within the
            "|"-separated ``entity_text``. An empty or ``None`` value means
            no sponsors are extracted.

    Returns:
        None.
    """
    instance = self.get_instance_from_row(row, column_indices)
    # Only the feature matrix is needed; the other two results are ignored.
    X, _, _ = pipe.instances_to_matrix(
        [instance], feature_space=self.feature_space, dense=False)
    scores = self.model.decision_function(X)

    fields = [
        'congress', 'chamber', 'document_type', 'number', 'row',
        'row_offset', 'row_length', 'score', 'state', 'sponsors'
    ]
    cmd = ("insert into candidate_earmarks (" + ", ".join(fields) +
           ") values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s) returning id")

    attributes = instance.attributes
    entity_text = attributes['entity_text']
    state = self.geo_coder.get_state(entity_text)

    cur = self.conn.cursor()
    try:
        # Always defined, so the join below works when no indices are given.
        sponsors = []
        if sponsor_indices:
            print(sponsor_indices)
            # The text does not change inside the loop; split it once.
            cells = entity_text.split("|")
            for index in sponsor_indices:
                try:
                    sponsor_cell = cells[index]
                    sponsors_in_cell = string_functions.tokenize(
                        string_functions.normalize_no_lower(sponsor_cell))
                    for sic in sponsors_in_cell:
                        if sic in self.sponsor_coder.sponsors[congress]:
                            sponsors.append(sic)
                except Exception:
                    # Best effort: a bad index or unknown congress must not
                    # prevent the row itself from being stored.
                    print("Index: %d" % index)
                    print(len(cells))
                    print(entity_text)
                    logging.exception("SCREW UP")

        # The joined sponsor list is cut to 1024 characters before storing.
        sponsors_string = "|".join(sponsors)[:1024]
        cur.execute(cmd, (congress, chamber, document_type, number,
                          entity_text, row.offset + table_offset, row.length,
                          scores[0], state, sponsors_string))
        curr_id = cur.fetchone()[0]
        for sponsor in sponsors:
            cur.execute(
                'insert into sponsors (candidate_earmark_id, sponsor) values (%s, %s)',
                (curr_id, sponsor))
        self.conn.commit()
    finally:
        # Release the cursor whether or not the inserts succeeded.
        cur.close()