def get_source_target(cand: DataPoint) -> DataPoint:
    """Attach the first source and first target mentioned in the sentence.

    Scans ``cand.doc`` for tokens whose text appears in ``sources`` and in
    ``targets`` (both names are defined outside this function) and stores
    the first hit of each as the tuple ``cand.source_target``.

    :param cand: A candidate exposing ``doc``, an iterable of tokens that
        each have a ``text`` attribute.
    :return: The same candidate with ``source_target`` set to
        ``(source, target)``, or ``(np.nan, np.nan)`` when either one is
        missing from the sentence.
    """
    first_source = next(
        (token.text for token in cand.doc if token.text in sources), None
    )
    first_target = next(
        (token.text for token in cand.doc if token.text in targets), None
    )

    # Both halves of the pair are required; a lone source or target is
    # reported as a fully missing pair.
    if first_source is None or first_target is None:
        cand.source_target = (np.nan, np.nan)
    else:
        cand.source_target = (first_source, first_target)
    return cand
def get_left_tokens(cand: DataPoint, window: int = 3) -> DataPoint:
    """
    Returns tokens in the length ``window`` window to the left of the person
    mentions.

    For each of the two person mentions, the ``window`` tokens immediately
    preceding the mention's first token are stored on the candidate as
    ``person1_left_tokens`` and ``person2_left_tokens``. The token directly
    adjacent to the mention is part of the window. Fewer tokens are returned
    when the mention sits closer than ``window`` tokens to the sentence start.

    ``window`` defaults to 3, so existing single-argument callers are
    unaffected by the extra parameter.
    """

    def _left_window(mention_start):
        # Clamp at 0: a negative start index would wrap around to the end
        # of the token list instead of stopping at the sentence start.
        return cand.tokens[max(0, mention_start - window):mention_start]

    cand.person1_left_tokens = _left_window(cand.person1_word_idx[0])
    cand.person2_left_tokens = _left_window(cand.person2_word_idx[0])
    return cand
def get_left_tokens(cand: DataPoint, window: int = 3) -> DataPoint:
    """
    Returns tokens in the ``window``-length window to the left of entity
    mentions.

    The window ends right before the first token of each mention, so the
    token adjacent to the mention is included. It is shortened when the
    mention is closer than ``window`` tokens to the start of the sentence.

    :param cand: A candidate DF.
    :param window: Number of tokens to take to the left of each mention
        (defaults to 3).
    :return: Candidate DF with two new columns, each a list of tokens to the
        left of entities.
    """
    for left_field, idx_field in (
        ("person1_left_tokens", "person1_word_idx"),
        ("person2_left_tokens", "person2_word_idx"),
    ):
        mention_start = getattr(cand, idx_field)[0]
        # Clamp at 0 so the slice never wraps around via a negative index.
        window_start = max(0, mention_start - window)
        setattr(cand, left_field, cand.tokens[window_start:mention_start])
    return cand
def get_text_between(cand: DataPoint) -> DataPoint:
    """
    Returns the text between the two person mentions in the sentence.

    The tokens strictly after the last token of person 1 and strictly before
    the first token of person 2 are joined with single spaces and stored on
    the candidate as ``text_between``.
    """
    first_after_person1 = cand.person1_word_idx[1] + 1
    person2_start = cand.person2_word_idx[0]
    tokens_between = cand.tokens[first_after_person1:person2_start]
    cand.text_between = " ".join(tokens_between)
    return cand
def get_text_between(cand: DataPoint) -> DataPoint:
    """
    Returns the text between two entity mentions.

    :param cand: A candidate DF.
    :return: Candidate DF with new column ``text_between``: the tokens after
        the end of the first mention and before the start of the second one,
        joined with single spaces.
    """
    between = slice(cand.person1_word_idx[1] + 1, cand.person2_word_idx[0])
    cand.text_between = " ".join(cand.tokens[between])
    return cand
def get_person_last_names(cand: DataPoint) -> DataPoint:
    """
    Returns the last names for the two person mentions in candidate.

    The mention texts are first computed by ``get_person_text``. A last name
    is the final space-separated word of a mention; mentions made of a single
    word get ``None``. The result is stored as ``cand.person_lastnames``.
    """
    cand = get_person_text(cand)
    person1_name, person2_name = cand.person_names

    last_names = []
    for full_name in (person1_name, person2_name):
        words = full_name.split(" ")
        # A one-word mention cannot be split into first and last name.
        last_names.append(words[-1] if len(words) > 1 else None)

    cand.person_lastnames = last_names
    return cand
def get_person_text(cand: DataPoint) -> DataPoint:
    """
    Returns the text for the two person mentions in candidate.

    Each ``person<N>_word_idx`` field holds the inclusive (start, end) token
    positions of a mention; the covered tokens are joined with single spaces.
    The two resulting strings are stored as the list ``cand.person_names``.
    """

    def _mention_text(idx_field):
        first = cand[idx_field][0]
        stop = cand[idx_field][1] + 1  # end index is inclusive
        return " ".join(cand["tokens"][first:stop])

    cand.person_names = [
        _mention_text("person{}_word_idx".format(number)) for number in [1, 2]
    ]
    return cand
def get_persons_last_name(cand: DataPoint) -> DataPoint:
    """
    Returns entity last names.

    :param cand: A candidate DF.
    :return: Candidate DF with a new column ``person_lastnames``, a list with
        the last word of each of the two entity names, or ``None`` for a
        name consisting of a single word.
    """

    def _last_word_or_none(full_name):
        pieces = full_name.split(" ")
        if len(pieces) > 1:
            return pieces[-1]
        return None

    cand = get_person_text(cand)
    person1_name, person2_name = cand.person_names
    cand.person_lastnames = [
        _last_word_or_none(person1_name),
        _last_word_or_none(person2_name),
    ]
    return cand
def get_person_text(cand: DataPoint) -> DataPoint:
    """
    Returns the text for the two person mentions in candidate sentence.

    :param cand: A candidate DF.
    :return: Candidate DF with new column ``person_names``, a list holding
        the space-joined tokens of each entity mention (both ends of the
        ``person<N>_word_idx`` range are included).
    """
    names = []
    for j in (1, 2):
        bounds = cand["person{j}_word_idx".format(j=j)]
        mention_tokens = cand["tokens"][bounds[0]:bounds[1] + 1]
        names.append(" ".join(mention_tokens))
    cand.person_names = names
    return cand
def square(x: DataPoint) -> DataPoint:
    """Store ``square_hit_tracker(x.num)`` on ``x.num_squared``.

    The tracker is always invoked first; afterwards the data point is
    returned, except when ``x.num`` equals 21, in which case ``None`` is
    returned instead.
    """
    tracked_square = square_hit_tracker(x.num)
    x.num_squared = tracked_square
    return None if x.num == 21 else x
def mapper_pre_2(x: DataPoint) -> DataPoint:
    """Set ``x.double_num_squared_plus_1`` to ``x.double_num_squared + 1``.

    The data point is modified in place and returned.
    """
    incremented = x.double_num_squared + 1
    x.double_num_squared_plus_1 = incremented
    return x
def modify_in_place(x: DataPoint) -> DataPoint:
    """Mutate ``x.d`` in place and return a new ``Row`` built from ``x``.

    ``x.d["my_key"]`` is set to 0 on the input itself; the returned ``Row``
    carries ``num`` plus the mutated mapping under both ``d`` and ``d_new``.
    """
    x.d["my_key"] = 0
    row_fields = dict(num=x.num, d=x.d, d_new=x.d)
    return Row(**row_fields)
def square(x: DataPoint) -> DataPoint:
    """Return a new ``Row`` with all fields of ``x`` plus ``num_squared``.

    ``num_squared`` is ``x.num ** 2``; the fields are taken from
    ``x.asDict()``.
    """
    return Row(**{**x.asDict(), "num_squared": x.num ** 2})
def square(x: DataPoint) -> DataPoint:
    """Return a new ``Row`` with all fields of ``x`` plus ``num_squared``.

    ``num_squared`` is whatever ``square_hit_tracker(x.num)`` returns; the
    remaining fields are taken from ``x.asDict()``.
    """
    row_fields = dict(x.asDict())
    row_fields.update(num_squared=square_hit_tracker(x.num))
    return Row(**row_fields)
def copy_features(x: DataPoint) -> DataPoint:
    """Compute x2 + 0.25 for direct comparison to x1.

    The result is stored on the data point as ``x3``; the data point is
    modified in place and returned.
    """
    shifted = x.x2 + 0.25
    x.x3 = shifted
    return x
def get_text_between(cand: DataPoint) -> DataPoint:
    """
    Returns the text between a source-target pair and the text to the left
    of the source.

    Token positions whose text appears in ``sources`` / ``targets`` (both
    defined outside this function) are collected from ``cand.doc``. Among
    all (source, target) position pairs where the source comes before the
    target, the last one in iteration order (sources outermost, targets
    innermost) is used:

    * ``cand.text_between`` is ``cand.doc[source:target]`` (the slice starts
      at the source token itself and stops before the target token);
    * ``cand.text_to_source_left`` is ``cand.doc[:source]``.

    When no source precedes any target -- including when either kind of
    mention is absent -- both attributes are set to the string ``'NaN'``,
    so they are always present on the returned candidate.
    """
    source_idx = [token.i for token in cand.doc if token.text in sources]
    target_idx = [token.i for token in cand.doc if token.text in targets]

    ordered_pairs = [
        (source_index, target_index)
        for source_index in source_idx
        for target_index in target_idx
        if source_index < target_index
    ]

    if ordered_pairs:
        # Later pairs take precedence over earlier ones.
        source_index, target_index = ordered_pairs[-1]
        cand.text_between = cand.doc[source_index:target_index]
        cand.text_to_source_left = cand.doc[:source_index]
    else:
        cand.text_between = 'NaN'
        cand.text_to_source_left = 'NaN'
    return cand
def square(x: DataPoint) -> DataPoint:
    """Set ``x.num_squared`` to ``x.num ** 2`` and return the data point."""
    base = x.num
    x.num_squared = base ** 2
    return x
def square(x: DataPoint) -> DataPoint:
    """Set ``x.num_squared`` to ``square_hit_tracker(x.num)``.

    The data point is modified in place and returned.
    """
    tracked_square = square_hit_tracker(x.num)
    x.num_squared = tracked_square
    return x
def mapper_pre(x: DataPoint) -> DataPoint:
    """Set ``x.double_num_squared`` to twice ``x.num_squared``.

    The data point is modified in place and returned.
    """
    doubled = 2 * x.num_squared
    x.double_num_squared = doubled
    return x
def square_returns_none(x: DataPoint) -> DataPoint:
    """Square ``x.num`` in place, except for the value 2.

    When ``x.num`` equals 2 the data point is left untouched and ``None`` is
    returned; otherwise ``x.num`` is replaced by its square and the data
    point is returned.
    """
    current = x.num
    if current == 2:
        return None
    x.num = current ** 2
    return x
def modify_in_place(x: DataPoint) -> DataPoint:
    """Mutate ``x.d`` in place and expose it again as ``x.d_new``.

    ``x.d["my_key"]`` is set to 0 and ``x.d_new`` is bound to that same
    mapping object; the data point is returned.
    """
    mapping = x.d
    mapping["my_key"] = 0
    x.d_new = mapping
    return x
def combine_text(x: DataPoint) -> DataPoint:
    """Set ``x.text`` to the title and the article separated by one space.

    The data point is modified in place and returned.
    """
    combined = f"{x.title} {x.article}"
    x.text = combined
    return x