def venn_intersect_text_tag(df_a, column_name, set_b, column_name_b, list_no):
    """Keep only the tokens whose selected field occurs in both set A and set B.

    Precondition: the cells of ``df_a[column_name]`` and
    ``set_b[column_name_b]`` are lists of tokens, each token being a mutable
    list whose element ``list_no`` is the field to compare (presumably
    0 = text, 1 = tag, as in ``assoc_term_attached`` -- confirm with callers).

    A token of set A is kept when its field is contained (``in``) in the same
    field of at least one token of set B.  A token of set B is kept when its
    field equals one of the values retained from set A.  Every other token is
    emptied in place with ``del token[:]``; the emptied tokens stay in their
    cell.

    Because tokens are emptied in place, the token lists held by the frames
    that were passed in are modified as well (unless ``sort.reindex``, defined
    elsewhere, deep-copies them).

    Tokens that are already empty are skipped instead of raising
    ``IndexError``.

    Args:
        df_a: dataframe holding set A.
        column_name: column of ``df_a`` that holds the token lists.
        set_b: dataframe holding set B.
        column_name_b: column of ``set_b`` that holds the token lists.
        list_no: index of the field inside each token to compare.

    Returns:
        The tuple ``(df_a, set_b)`` as returned by ``sort.reindex``, with the
        non-matching tokens emptied.
    """
    df_a = sort.reindex(df_a)
    set_b = sort.reindex(set_b)

    # Read set B's field values once instead of walking the whole column
    # again for every token of set A.  Already-emptied tokens have no field.
    b_values = [
        token_b[list_no]
        for row_b in set_b[column_name_b]
        for token_b in row_b
        if token_b
    ]

    match = []
    for row in df_a[column_name]:
        for word in row:
            if not word:
                # Emptied by an earlier filter pass; nothing to compare.
                continue
            value = word[list_no]
            if value in match:
                continue
            if any(value in b_value for b_value in b_values):
                match.append(value)
            else:
                # Not found anywhere in set B: remove the token from set A.
                del word[:]

    # Set B keeps only tokens whose field is one of the retained values.
    for row_b in set_b[column_name_b]:
        for word_b in row_b:
            if word_b and word_b[list_no] not in match:
                del word_b[:]

    return df_a, set_b
def _term_matches(term, token):
    """Return True when ``term[0]`` is contained in the token field chosen by ``term[1]``."""
    field = 0 if term[1] == 'text' else 1
    # A token too short to have the field (e.g. one emptied by an earlier
    # filter) cannot match.
    return len(token) > field and term[0] in token[field]


def _contains_term_sequence(tokens, term_struct):
    """Return True when consecutive tokens match every term of ``term_struct`` in order."""
    tokens = list(tokens)
    last_start = len(tokens) - len(term_struct)
    # Try every possible start position, so an occurrence that begins right
    # where a previous partial match failed is still found.
    return any(
        all(
            _term_matches(term, tokens[start + offset])
            for offset, term in enumerate(term_struct)
        )
        for start in range(last_start + 1)
    )


def assoc_term_attached(df, column_name, term_struct):
    """Blank out rows whose tokens do not contain the given term sequence.

    ``term_struct`` is the user input as a list of ``[value, kind]`` pairs,
    e.g. ``[[first value, 'text'], [second value, 'tag']]``.  A pair matches a
    token when ``value`` is contained (``in``) in the token's field 0 if
    ``kind == 'text'``, otherwise in field 1.  A row satisfies the filter when
    some run of consecutive tokens matches all pairs in order.  An empty
    ``term_struct`` is satisfied by every row.

    Rows that do not satisfy the filter are not dropped; their cell in
    ``column_name`` is replaced by an empty list.

    Args:
        df: dataframe to filter; it is passed through ``sort.reindex``
            (defined elsewhere) first.
        column_name: column whose cells hold the token lists.
        term_struct: ordered list of ``[value, kind]`` pairs.

    Returns:
        The reindexed dataframe with the non-matching cells set to ``[]``.
    """
    df = sort.reindex(df)

    rows_to_clear = [
        row_pos
        for row_pos, tokens in enumerate(df[column_name])
        if not _contains_term_sequence(tokens, term_struct)
    ]

    if rows_to_clear:
        column_pos = df.columns.get_loc(column_name)
        for row_pos in rows_to_clear:
            # Write straight into the frame: chained ``df.iloc[i][col] = ...``
            # assigns to a temporary row object and may never reach ``df``.
            df.iat[row_pos, column_pos] = []

    return df
def venn_union(df_a, df_b):
    """Stack set A and set B into one dataframe.

    Precondition: ``df_a`` and ``df_b`` must have the same columns.

    Args:
        df_a: dataframe holding set A.
        df_b: dataframe holding set B.

    Returns:
        A single dataframe with the rows of ``df_a`` followed by the rows of
        ``df_b``, passed through ``sort.reindex``.
    """
    combined = pd.concat([df_a, df_b])
    # concat keeps the old index of both inputs, so reindex the result.
    return sort.reindex(combined)
def spacy_clean_cell(df, column_name):
    """Remove emptied tokens from every cell and drop rows left with none.

    Other filter functions in this file remove a token by emptying it in
    place, which leaves empty lists inside the cells of ``column_name``.
    This function removes those falsy entries from each cell and then drops
    every row whose cell ends up empty.

    Args:
        df: dataframe to clean; it is passed through ``sort.reindex``
            (defined elsewhere) first.
        column_name: column whose cells hold the token lists.

    Returns:
        The reindexed dataframe with cleaned cells and without the rows whose
        cell became empty.  The index is not renumbered after the drop.
    """
    df = sort.reindex(df)

    # Clean each cell once and assign the column directly; the chained
    # ``df[col].loc[i] = ...`` form is not guaranteed to write back to ``df``.
    df[column_name] = df[column_name].map(
        lambda tokens: [token for token in tokens if token]
    )

    # Drop by the row's actual label rather than assuming labels are 0..n-1.
    emptied_labels = [
        label for label, tokens in df[column_name].items() if not tokens
    ]
    df.drop(index=emptied_labels, inplace=True)
    return df