def populate_ips_from_text_file(self):
    # Load the initial file
    file_content = self.get_text_from_file(file_path=self.file_path)

    # Instantiate the text parser that grabs the IPs
    text_parser = TextParser(raw_text=file_content)
    raw_ips = text_parser.list_of_ip_adresses_contained_in_raw_text()

    # Get env variables for the GeoIpService
    env_variable_getter = EnvVariableGetter()
    api_key = env_variable_getter.get_variable("api_key")
    api_url = env_variable_getter.get_variable("api_url")

    # Instantiate the GeoIp service, which is responsible for fetching IP geolocation
    geo_ip_service = GeoIpService(key=api_key, url=api_url)

    # Get the geo IP info using geo_ip_service, generate ip_models from the
    # response data, and store them in the list and the dict for further filtering
    for ip in raw_ips:
        geo_ip_response = geo_ip_service.get_geo_ip_info_for_ip(
            ip_address=ip, format="json")
        self.generate_and_store_ip_model(ip=ip, data=geo_ip_response.json())
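# A sketch of what generate_and_store_ip_model might look like, inferred from
# the loop comment above. IpModel, self.ip_models, and self.ip_models_by_address
# are assumptions for illustration, not part of the original code.
def generate_and_store_ip_model(self, ip, data):
    ip_model = IpModel(address=ip, geo_data=data)  # hypothetical model class
    self.ip_models.append(ip_model)                # ordered list of models
    self.ip_models_by_address[ip] = ip_model       # keyed by IP for fast filtering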
from text_parser import TextParser
import time

path = '/home/tim/Dropbox/Notes/journal.txt'

tp = TextParser(path)
tp.prepare_batches(128, 32, 4)
print(len(tp.vocab))

# Alternate between the cross-validation and training splits to check that
# switching splits does not corrupt the batch state
for i in range(10):
    tp.switch_split('cv')
    print(tp.get_next_feed_dict('a', 'b'))
    tp.switch_split('train')
    print(tp.get_next_feed_dict('a', 'b'))
    tp.switch_split('cv')
    print(tp.get_next_feed_dict('a', 'b'))
    tp.switch_split('train')
    print(tp.get_next_feed_dict('a', 'b'))

# Throughput stress test
for i in range(1000000):
    tp.get_next_feed_dict('a', 'b')
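# The otherwise-unused `import time` suggests the stress loop was meant to be
# timed; a minimal sketch of that measurement (an assumption about intent, not
# original code):
start = time.time()
for i in range(1000000):
    tp.get_next_feed_dict('a', 'b')
print('1,000,000 feed dicts in %.2f s' % (time.time() - start))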
from text_parser import TextParser  # import assumed; the original snippet omits it


def tokenize(text):
    tokenizer = TextParser(stopword_file='stopwords.txt')
    # Non-stemmed alternative: tokens = tokenizer.parse_words(text)
    tokens = tokenizer.parse_words(text, stem=True)
    return tokens
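# Example usage (illustrative; actual tokens depend on stopwords.txt and the
# stemmer behind parse_words):
# >>> tokenize('Running the test parsers')
# ['run', 'test', 'parser']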
def clean_words(self, word_parser):
    self.words = word_parser.parse_words(self.raw_message)

# Clean words for one message (NOT efficient)
def clean_words_pos(self, word_parser, preserve_types, ark_run_cmd):
    self.words = word_parser.parse_words_by_ark_nlp(
        self.raw_message, preserve_types, ark_run_cmd)

def set_clean_words(self, clean_words):
    # Deduplicating alternative: self.words = list(set(clean_words))
    self.words = clean_words

def remove_stopwords(self, stopword_set):
    trimmed_words = []
    for w in self.words:
        if w not in stopword_set:
            trimmed_words.append(w)
    self.words = trimmed_words


if __name__ == '__main__':
    wp = TextParser(min_length=2)
    m = Message(
        'hello, This is@ went octopi just a test for 12you!. Try it http://')
    preserve_types = ['V', 'N', '^']  # defined here, but the call below passes its own tag set
    ark_run_cmd = 'java -XX:ParallelGCThreads=2 -Xmx2G -jar /Users/chao/Dropbox/code/lib/ark-tweet-nlp-0.3.2.jar'
    m.clean_words(wp)
    print(m.words)
    m.clean_words_pos(wp, set(['S', 'N', '^']), ark_run_cmd)
    print(m.words)
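    # remove_stopwords is not exercised above; a minimal demonstration
    # (this stopword set is illustrative, not from the original):
    m.remove_stopwords(set(['just', 'test']))
    print(m.words)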