def get_labeled_docs(gold_csv="gold.csv"):
    path = "data/" + gold_csv
    print('File:', path)
    csv_reader = csv.DictReader(open(path, 'rU', encoding="latin-1"), delimiter=',')
    labeled_docs = []
    for line in csv_reader:
        #print('Line:', line)
        instance_id = line['project_id']
        text = line['tweet_text']
        # TODO: save the Date field
        label = line['label']
        if label in Config.code_book:
            doc = TextInstance(instance_id, text)
            doc.label = label
            # creating the text line that will be passed through tokenize(docs) in Tokenizer.py
            doc.text = text
            labeled_docs.append(doc)
        else:
            print("post: " + instance_id + " label: " + label)
    # adds tokens and pos tags
    labeled_docs = tokenize(labeled_docs)
    return labeled_docs
def clean_text(text):
    # NLP Pre-Processing
    tokenize_words = tokenize(text.lower())
    stop_words_removed = remove_stop_words(tokenize_words)
    relevant_words = [
        WordNetLemmatizer().lemmatize(words, pos='v')
        for words in stop_words_removed if len(words) > 3
    ]
    return relevant_words
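
# Hedged usage sketch (not part of the original module): assumes tokenize() and
# remove_stop_words() come from the project's own preprocessing helpers and that
# NLTK's wordnet data is available for WordNetLemmatizer.
if __name__ == "__main__":
    # Stop words and tokens of length <= 3 are dropped; the rest are lemmatized as verbs.
    print(clean_text("The cats were running and jumping over the fences"))
    # expected output along the lines of: ['cat', 'run', 'jump', 'fence']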
def __init__(self, path):
    with open(path) as f:
        text = f.read()
    self.tokens = tokenize(text)
    self.count = 0
    self.ClassScope = SymbolTable()
    self.SubroutineScope = SymbolTable()
    self.label_count = {"if": 0, "while": 0}
    self.vmcode = ""
def check_filter(self, message: str) -> bool:
    """Returns True if message contains a banned word.

    Args:
        message (str): The message to check.
    """
    for word in tokenize(message):
        if word.lower() in self.blacklist:
            return True
    return False
def analyze(root_dir):
    if len(root_dir) == 0:
        print("No input file name")
        exit()
    tokenized_data, file_names = tokenize(root_dir)
    if len(tokenized_data) != len(file_names):
        raise Exception("XML data and file names count mismatch")
    parsed_files = []
    for i in range(len(tokenized_data)):
        # print("\n\n\nParsing file #" + str(i + 1) + " (" + file_names[i] + ")")
        parsed_files.append(parse(tokenized_data[i]))
    return parsed_files, file_names
def find_IOBs(doc):
    IOB_output = []
    # tknzr = TweetTokenizer()
    # tokenize doc.text from docs
    tokens = tokenize(doc)
    #print('THIS IS TOKENS: ', tokens)
    # go over tokens, generating IOB and omitting [ ]
    # status is out, begin, cue or in
    status = 'out'
    for token in tokens:
        #print(token)
        if status == 'out' and token != '[':
            IOB_output.append('O')
            continue
        if status == 'out' and token == '[':
            status = 'begin'
            continue
        if status == 'begin' and token != '/':
            IOB_output.append('B')
            status = 'in'
            continue
        if status == 'begin' and token == '/':
            status = 'cue'
            continue
        if status == 'cue' and token != '~':
            IOB_output.append('C')
            continue
        if status == 'cue' and token == '~':
            status = 'in'
            continue
        if status == 'in' and token != ']' and token != '/':
            IOB_output.append('I')
            continue
        if status == 'in' and token != ']' and token == '/':
            status = 'cue'
            continue
        if status == 'in' and token == ']':
            status = 'out'
    return IOB_output
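
# Hedged usage sketch (not part of the original module): find_IOBs walks the token
# stream as a small state machine in which '[' opens an annotated span, '/' starts a
# cue, '~' ends the cue, and ']' closes the span. Assuming tokenize(doc) yields
# roughly whitespace tokens from doc.text, the stand-in doc below would get one tag
# per content token, with the markup symbols omitted.
if __name__ == "__main__":
    class _Doc:  # minimal stand-in for the project's TextInstance-style object
        text = "I [ love / really ~ it ]"
    print(find_IOBs(_Doc()))
    # expected output along the lines of: ['O', 'B', 'C', 'I']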
def constructInvertedIndex():
    global dictionary
    dictionary = BTree(Node("سسسسسس", 1, []))
    nodesList = []
    docCounter = 0
    for news in getNewsList():
        nodes = {}
        position = 0
        for term in tokenize(normalize(news.content), check_finglish):
            if term != invalidToken:
                nodes[dictionary.addOccurrence(term, news.id, position)] = True
                position += 1
        nodesList.append(nodes)
        for node in nodes:
            node.cal_tf(news.id)
        docCounter += 1
        if docCounter % 20 == 0:
            Laws.heap(getDictionary())
    calAllIdf(dictionary.root)

    i = 0
    for news in getNewsList():
        # calculate the documents' normalization factors for the 3 scoring schemes
        nodes = nodesList[i]
        sum_of_squares_1 = 0
        sum_of_squares_2 = 0
        sum_of_squares_3 = 0
        for node in nodes.keys():
            sum_of_squares_1 += math.pow((getTf(news.id, node.postingsList) - 1) * node.idf, 2)
            sum_of_squares_2 += math.pow(getTf(news.id, node.postingsList), 2)
            sum_of_squares_3 += math.pow(getTf(news.id, node.postingsList) * node.idf, 2)
        normalizationFactorsScheme1.append(math.sqrt(sum_of_squares_1))
        normalizationFactorsScheme2.append(math.sqrt(sum_of_squares_2))
        normalizationFactorsScheme3.append(math.sqrt(sum_of_squares_3))
        i += 1

    Laws.storeHeapDataSet()
    storeDictionary(dictionary)
    storeNormFactors()
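
# Hedged note (not part of the original module): the three normalization factors
# computed above look like document-vector lengths under three tf-idf weighting
# schemes, i.e. for a document d over its terms t:
#   scheme 1: sqrt( sum_t ((tf(t, d) - 1) * idf(t))^2 )
#   scheme 2: sqrt( sum_t tf(t, d)^2 )
#   scheme 3: sqrt( sum_t (tf(t, d) * idf(t))^2 )
# Presumably these factors later divide the raw query-document scores so that
# longer documents are not favored.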
def message_handler(self, m: Message):
    try:
        if m.type == "366":
            logger.info(f"Successfully joined channel: #{m.channel}")
            # Get the list of mods used for modifying the blacklist
            logger.info("Fetching mod list...")
            self.ws.send_message("/mods")

        elif m.type == "NOTICE":
            # Check whether the NOTICE is a response to our /mods request
            if m.message.startswith("The moderators of this channel are:"):
                string_list = m.message.replace(
                    "The moderators of this channel are:", "").strip()
                self.mod_list = [m.channel] + string_list.split(", ")
                logger.info(
                    f"Fetched mod list. Found {len(self.mod_list) - 1} mods.")
            elif m.message == "There are no moderators of this channel.":
                self.mod_list = [m.channel]
                logger.info("Fetched mod list. Found no mods.")
            # If it is not, log this NOTICE
            else:
                logger.info(m.message)

        elif m.type in ("PRIVMSG", "WHISPER"):
            if m.message.startswith("!enable") and self.check_if_permissions(m):
                if self._enabled:
                    self.ws.send_whisper(
                        m.user, "The generate command is already enabled.")
                else:
                    self.ws.send_whisper(
                        m.user, "Users can now use generate command again.")
                    self._enabled = True
                    logger.info("Users can now use generate command again.")

            elif m.message.startswith("!disable") and self.check_if_permissions(m):
                if self._enabled:
                    self.ws.send_whisper(
                        m.user, "Users can now no longer use generate command.")
                    self._enabled = False
                    logger.info("Users can now no longer use generate command.")
                else:
                    self.ws.send_whisper(
                        m.user, "The generate command is already disabled.")

            elif m.message.startswith(("!setcooldown", "!setcd")) and self.check_if_permissions(m):
                split_message = m.message.split(" ")
                if len(split_message) == 2:
                    try:
                        cooldown = int(split_message[1])
                    except ValueError:
                        self.ws.send_whisper(
                            m.user,
                            "The parameter must be an integer amount, eg: !setcd 30")
                        return
                    self.cooldown = cooldown
                    Settings.update_cooldown(cooldown)
                    self.ws.send_whisper(
                        m.user,
                        f"The !generate cooldown has been set to {cooldown} seconds.")
                else:
                    self.ws.send_whisper(
                        m.user,
                        "Please add exactly 1 integer parameter, eg: !setcd 30.")

        if m.type == "PRIVMSG":
            # Ignore bot messages
            if m.user.lower() in self.denied_users:
                return

            if self.check_if_generate(m.message):
                if not self.enable_generate_command and not self.check_if_permissions(m):
                    return

                if not self._enabled:
                    if not self.db.check_whisper_ignore(m.user):
                        self.send_whisper(
                            m.user,
                            "The !generate has been turned off. !nopm to stop me from whispering you.")
                    return

                cur_time = time.time()
                if self.prev_message_t + self.cooldown < cur_time or self.check_if_permissions(m):
                    if self.check_filter(m.message):
                        sentence = "You can't make me say that, you madman!"
                    else:
                        params = tokenize(m.message)[2:] if self.allow_generate_params else None
                        # Generate an actual sentence
                        sentence, success = self.generate(params)
                        if success:
                            # Reset cooldown if a message was actually generated
                            self.prev_message_t = time.time()
                    logger.info(sentence)
                    self.ws.send_message(sentence)
                else:
                    if not self.db.check_whisper_ignore(m.user):
                        self.send_whisper(
                            m.user,
                            f"Cooldown hit: {self.prev_message_t + self.cooldown - cur_time:0.2f} out of {self.cooldown:.0f}s remaining. !nopm to stop these cooldown pm's.")
                    logger.info(
                        f"Cooldown hit with {self.prev_message_t + self.cooldown - cur_time:0.2f}s remaining.")
                return

            # Send help message when requested.
            elif m.message.startswith(("!ghelp", "!genhelp", "!generatehelp")):
                self.send_help_message()

            # Ignore the message if it is deemed a command
            elif self.check_if_other_command(m.message):
                return

            # Ignore the message if it contains a link.
            elif self.check_link(m.message):
                return

            if "emotes" in m.tags:
                # If the list of emotes contains "emotesv2_", then the message contains a bit emote,
                # and we choose not to learn from those messages.
                if "emotesv2_" in m.tags["emotes"]:
                    return

                # Replace modified emotes with normal versions,
                # as the bot will never have the modified emotes unlocked at the time.
                for modifier in self.extract_modifiers(m.tags["emotes"]):
                    m.message = m.message.replace(modifier, "")

            # Ignore the message if any word in the sentence is on the ban filter
            if self.check_filter(m.message):
                logger.warning(
                    f"Sentence contained blacklisted word or phrase:\"{m.message}\"")
                return
            else:
                # Try to split up sentences. Requires nltk's 'punkt' resource
                try:
                    sentences = sent_tokenize(m.message.strip())
                # If 'punkt' is not downloaded, then download it, and retry
                except LookupError:
                    logger.debug("Downloading required punkt resource...")
                    import nltk
                    nltk.download('punkt')
                    logger.debug("Downloaded required punkt resource.")
                    sentences = sent_tokenize(m.message.strip())

                for sentence in sentences:
                    # Get all separate words
                    words = sentence.split(" ")
                    # Double spaces will lead to invalid rules. We remove empty words here
                    if "" in words:
                        words = [word for word in words if word]

                    # If the sentence is too short, ignore it and move on to the next.
                    if len(words) <= self.key_length:
                        continue

                    # Add a new starting point for a sentence to the <START>
                    #self.db.add_rule(["<START>"] + [words[x] for x in range(self.key_length)])
                    self.db.add_start_queue([words[x] for x in range(self.key_length)])

                    # Create Key variable which will be used as a key in the Dictionary for the grammar
                    key = list()
                    for word in words:
                        # Set up key for first use
                        if len(key) < self.key_length:
                            key.append(word)
                            continue

                        self.db.add_rule_queue(key + [word])

                        # Remove the first word, and add the current word,
                        # so that the key is correct for the next word.
                        key.pop(0)
                        key.append(word)

                    # Add <END> at the end of the sentence
                    self.db.add_rule_queue(key + ["<END>"])

        elif m.type == "WHISPER":
            # Allow people to whisper the bot to disable or enable whispers.
            if m.message == "!nopm":
                logger.debug(f"Adding {m.user} to Do Not Whisper.")
                self.db.add_whisper_ignore(m.user)
                self.ws.send_whisper(
                    m.user,
                    "You will no longer be sent whispers. Type !yespm to reenable. ")

            elif m.message == "!yespm":
                logger.debug(f"Removing {m.user} from Do Not Whisper.")
                self.db.remove_whisper_ignore(m.user)
                self.ws.send_whisper(
                    m.user,
                    "You will again be sent whispers. Type !nopm to disable again. ")

            # Note that I add my own username to this list to allow me to manage the
            # blacklist in channels of my bot in channels I am not modded in.
            # I may modify this and add a "allowed users" field in the settings file.
            elif m.user.lower() in self.mod_list + ["cubiedev"] + self.allowed_users:
                # Adding to the blacklist
                if self.check_if_our_command(m.message, "!blacklist"):
                    if len(m.message.split()) == 2:
                        # TODO: Remove newly blacklisted word from the Database
                        word = m.message.split()[1].lower()
                        self.blacklist.append(word)
                        logger.info(f"Added `{word}` to Blacklist.")
                        self.write_blacklist(self.blacklist)
                        self.ws.send_whisper(m.user, "Added word to Blacklist.")
                    else:
                        self.ws.send_whisper(
                            m.user,
                            "Expected Format: `!blacklist word` to add `word` to the blacklist")

                # Removing from the blacklist
                elif self.check_if_our_command(m.message, "!whitelist"):
                    if len(m.message.split()) == 2:
                        word = m.message.split()[1].lower()
                        try:
                            self.blacklist.remove(word)
                            logger.info(f"Removed `{word}` from Blacklist.")
                            self.write_blacklist(self.blacklist)
                            self.ws.send_whisper(m.user, "Removed word from Blacklist.")
                        except ValueError:
                            self.ws.send_whisper(
                                m.user, "Word was already not in the blacklist.")
                    else:
                        self.ws.send_whisper(
                            m.user,
                            "Expected Format: `!whitelist word` to remove `word` from the blacklist.")

                # Checking whether a word is in the blacklist
                elif self.check_if_our_command(m.message, "!check"):
                    if len(m.message.split()) == 2:
                        word = m.message.split()[1].lower()
                        if word in self.blacklist:
                            self.ws.send_whisper(m.user, "This word is in the Blacklist.")
                        else:
                            self.ws.send_whisper(m.user, "This word is not in the Blacklist.")
                    else:
                        self.ws.send_whisper(
                            m.user,
                            "Expected Format: `!check word` to check whether `word` is on the blacklist.")

        elif m.type == "CLEARMSG":
            # If a message is deleted, its contents will be unlearned,
            # or rather, the "occurances" attribute of each combination of words in the sentence
            # is reduced by 5, and deleted if the occurances is now less than 1.
            self.db.unlearn(m.message)

            # TODO: Think of some efficient way to check whether it was our message that got deleted.
            # If the bot's message was deleted, log this as an error
            #if m.user.lower() == self.nick.lower():
            #    logger.error(f"This bot message was deleted: \"{m.message}\"")

    except Exception as e:
        logger.exception(e)
def user_story_processing(user_story):
    existing_comparison_technique = ['cosine', 'euclidean', 'manhattan']

    # NLP Pre-Processing
    tokenize_words = tokenize(user_story)
    corrected_words = spell_checker(tokenize_words)
    stop_words_removed = remove_stop_words(corrected_words)
    hypothesis_synonyms_values = synonyms_words(stop_words_removed)
    lda_output = lda_supervised_topic_modelling(stop_words_removed)

    # Insights from Database
    server_connection = database_processing.mysql_connection('root', 'root', 'localhost')
    databases_present = database_processing.database_information(server_connection)

    number_of_values = 1
    database_finalisation_list = []
    vectorized_words = word_embedding_tfidf(databases_present, hypothesis_synonyms_values)

    for comparison_technique in existing_comparison_technique:
        # Finding the Database to be referred
        if comparison_technique == "euclidean":
            extracted_database_finalised = euclidean_distance(
                databases_present, vectorized_words, number_of_values)
            database_finalisation_list.append(extracted_database_finalised)
        elif comparison_technique == "cosine":
            extracted_database_finalised = cosine_similarity(
                databases_present, vectorized_words, number_of_values)
            database_finalisation_list.append(extracted_database_finalised)
        elif comparison_technique == "manhattan":
            extracted_database_finalised = manhattan_distance(
                databases_present, vectorized_words, number_of_values)
            database_finalisation_list.append(extracted_database_finalised)

    database_finalised_value = processing_array_generated(
        database_finalisation_list, number_of_values)
    database_finalised = database_finalised_value[0]

    while True:
        user_decision = input(
            "Database Predicted by System is " + database_finalised.upper() +
            ".\nIs the prediction Correct?\nYes - If Prediction is Correct\nNo - If Prediction is Wrong\nNA - Not Aware of Database\nq - To go Back : "
        )
        if user_decision == "Yes":
            break
        elif user_decision == "No":
            print("Following are the list of Database Present:")
            count = 1
            for x in range(0, len(databases_present)):
                print(str(count) + " " + databases_present[x].upper())
                count = count + 1
            database_finalised = input("Enter the Correct Database Name: ").lower()
            break
        elif user_decision == "NA":
            print("All Databases present in the Database Connection will be Considered")
            database_finalised = " "
            break
        elif user_decision == "q":
            return
        else:
            print("Kindly insert input in Yes or No")

    database_metadata_information = []
    database_value = []
    table_information = []
    fields = []
    field_datatype = []
    field_comments = []

    if database_finalised == " ":
        for x in range(0, len(databases_present)):
            database_metadata_info, database_val, table_info, field_info, field_datatype_info, field_comments_info = database_processing.database_metadata_information(
                server_connection, databases_present[x])
            database_metadata_information.extend(database_metadata_info)
            database_value.extend(database_val)
            table_information.extend(table_info)
            fields.extend(field_info)
            field_datatype.extend(field_datatype_info)
            field_comments.extend(field_comments_info)
    else:
        database_metadata_information, database_value, table_information, fields, field_datatype, field_comments = database_processing.database_metadata_information(
            server_connection, database_finalised)

    updated_fields_complete = []
    for field in fields:
        field = re.sub('[^0-9a-zA-Z]+', ' ', field)
        updated_fields_complete.append(field)
    updated_fields = pd.unique(updated_fields_complete).tolist()
    field_comments = pd.unique(field_comments).tolist()

    # Advance NLP Processing
    #relevant_words = [words for words in stop_words_removed if len(words) > 3]
    pos_tagged_words = part_of_speech_tagging(stop_words_removed)
    synonyms_values = synonyms_words(pos_tagged_words)

    if len(updated_fields) <= pos_tagged_words.size:
        number_of_values = len(updated_fields)
    else:
        number_of_values = pos_tagged_words.size

    # Field Value Processing
    relevant_columns_based_on_comments = []
    relevant_columns_based_on_fields = []
    column_predicted_list = []

    if len(updated_fields):
        vectorized_field_words = word_embedding_tfidf(updated_fields, synonyms_values)
        for comparison_technique in existing_comparison_technique:
            # Finding the Database to be referred
            if comparison_technique == "euclidean":
                relevant_columns_based_on_fields = euclidean_distance(
                    updated_fields, vectorized_field_words, number_of_values)
            elif comparison_technique == "cosine":
                relevant_columns_based_on_fields = cosine_similarity(
                    updated_fields, vectorized_field_words, number_of_values)
            elif comparison_technique == "manhattan":
                relevant_columns_based_on_fields = manhattan_distance(
                    updated_fields, vectorized_field_words, number_of_values)
            column_predicted_list.extend(relevant_columns_based_on_fields)

    if len(field_comments) and len(updated_fields) == len(field_comments):
        vectorized_comment_words = word_embedding_tfidf(field_comments, synonyms_values)
        for comparison_technique in existing_comparison_technique:
            # Finding the Database to be referred
            if comparison_technique == "euclidean":
                relevant_columns_based_on_comments = euclidean_distance(
                    field_comments, vectorized_comment_words, number_of_values)
            elif comparison_technique == "cosine":
                relevant_columns_based_on_comments = cosine_similarity(
                    field_comments, vectorized_comment_words, number_of_values)
            elif comparison_technique == "manhattan":
                relevant_columns_based_on_comments = manhattan_distance(
                    field_comments, vectorized_comment_words, number_of_values)
        relevant_fields_based_on_comments = []
        for comments in relevant_columns_based_on_comments:
            relevant_fields_based_on_comments.append(
                updated_fields[field_comments.index(comments)])
        column_predicted_list.extend(relevant_fields_based_on_comments)

    number_of_values = len(list(set(column_predicted_list)))
    column_finalised = processing_array_generated(column_predicted_list, number_of_values)

    field_finalised = []
    for field_value in column_finalised:
        field_finalised.append(fields[updated_fields_complete.index(field_value)])

    finalised_database = []
    finalised_table = []
    for field in field_finalised:
        indices = [i for i, x in enumerate(fields) if x == field]
        field_database = []
        field_table = []
        index = 0
        for z in indices:
            field_database.insert(index, database_value[z].upper())
            field_table.insert(index, table_information[z].upper())
            index = index + 1
        field_database = pd.unique(field_database).tolist()
        field_table = pd.unique(field_table).tolist()
        finalised_database.append(field_database)
        finalised_table.append(field_table)

    print('**** After NLP Processing ****')
    result_display(field_finalised, finalised_table, finalised_database)

    print('**** After Feature Selection ****')
    field_finalised, finalised_table, finalised_database, feature_list, logs, feature_encoded = feature_selection_processing(
        field_finalised, finalised_table, finalised_database, server_connection)

    print('**** Logs ****')
    for x in range(len(logs)):
        print(logs[x])
    result_display(field_finalised, finalised_table, finalised_database)

    if (lda_output[0] != " ") and (len(field_finalised) != 0):
        print('**** Probable Algorithms ****')
        algorithm_used, accuracy_score, target_feature, independent_features, message = algorithm_selection_processing(
            feature_list, lda_output, feature_encoded)

        if message == " ":
            table = PrettyTable([
                'Preferences', 'Algorithm Preferred', 'Accuracy Percentage',
                'Target Feature (Field Name__Table Name__Database Name)',
                'Independent Features'
            ])
            index = 1
            for i in range(len(algorithm_used)):
                table.add_row([
                    index, algorithm_used[index - 1], accuracy_score[index - 1],
                    target_feature[index - 1], independent_features[index - 1]
                ])
                index = index + 1
            print(table)
        else:
            print(message)
    @staticmethod
    def cosine_similarity(text1, text2):
        """
        :param text1, text2: list of tokens
        :return: float
        """
        try:
            tfidf = TfidfVectorizer().fit_transform(
                map(_SimilarityUtil._inverse, [text1, text2]))
            return 1 - cosine(tfidf[0].todense(), tfidf[1].todense())
        except ValueError:
            # Possibly containing only stopwords
            return 0


if __name__ == '__main__':
    from Tokenizer import tokenize, tokenize_with_preprocess

    token1 = tokenize('I l\'ove reading')
    token2 = tokenize('I really love look book')
    print(_SimilarityUtil.jaccard_similarity(token1, token2))

    token1_pre = tokenize_with_preprocess('I l\'ove reading')
    token2_pre = tokenize_with_preprocess('I really love look book')
    print(_SimilarityUtil.jaccard_similarity(token1_pre, token2_pre))
    print(_SimilarityUtil.cosine_similarity(token1_pre, token2_pre))
    print(_SimilarityUtil.cosine_similarity(['love', 'apple'], ['love', 'apple']))
    print(_SimilarityUtil.cosine_similarity(['a'], ['a']))
        #self.xml += "<subroutineCall>\n"
        if self.tokens[self.count + 1]["token"] == ".":
            for i in range(3):
                self.writeToken()  # (className|varName) . subroutineName
        else:
            self.writeToken()  # subroutineName
        self.writeToken()  # (
        self.compileExpressionList()
        self.writeToken()  # )
        #self.xml += "</subroutineCall>\n"

    def compileExpressionList(self):
        self.xml += "<expressionList>\n"
        if self.tokens[self.count]["token"] == ")":
            pass
        else:
            self.compileExpression()
            while self.tokens[self.count]["token"] == ",":
                self.writeToken()  # ,
                self.compileExpression()
        self.xml += "</expressionList>\n"


if __name__ == "__main__":
    path = sys.argv[1]
    with open(path) as f:
        text = f.read()
    tokens = tokenize(text)
    parser = ParserXML(tokens)
    parser.writeXML(path[:-5] + ".xml")
def parse(line):
    tokens = tokenize(line)
    return parseLine(tokens)
def parse(program):
    return read_from_tokens(tokenize(program))
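
# Hedged usage sketch (not part of the original source): assumes the classic
# Lisp-style pairing in which tokenize() pads parentheses with spaces and splits
# on whitespace, and read_from_tokens() builds a nested list, converting numeric
# tokens to int/float.
if __name__ == "__main__":
    print(parse("(+ 1 (* 2 3))"))
    # expected output along the lines of: ['+', 1, ['*', 2, 3]]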