Code Example #1
    def populate_ips_from_text_file(self):

        # Loading the initial file
        file_content = self.get_text_from_file(file_path=self.file_path)

        # Instantiating the text parser that grabs the ips
        text_parser = TextParser(raw_text=file_content)
        raw_ips = text_parser.list_of_ip_adresses_contained_in_raw_text()

        # Get env variables for the GeoIpService
        env_variable_getter = EnvVariableGetter()
        api_key = env_variable_getter.get_variable("api_key")
        api_url = env_variable_getter.get_variable("api_url")

        # Instantiate the GeoIp service, which is responsible for fetching IP geolocation data
        geo_ip_service = GeoIpService(key=api_key, url=api_url)

        # Get the geo IP info using geo_ip_service, generate ip_models from
        # the response data, and store them in the list and the dict for further filtering
        for ip in raw_ips:
            geo_ip_response = geo_ip_service.get_geo_ip_info_for_ip(
                ip_address=ip, format="json")
            self.generate_and_store_ip_model(ip=ip, data=geo_ip_response.json())
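
The method above delegates IP extraction to TextParser.list_of_ip_adresses_contained_in_raw_text(), which is not shown here. As a rough illustration of what such an extractor can look like, the sketch below pulls IPv4-looking strings out of raw text with a regular expression; the class name, pattern, and de-duplication behaviour are assumptions for illustration only, not the project's actual parser.

import re

class SimpleIpExtractor:
    # Illustrative stand-in for the IP-extracting TextParser used above.
    # Naive IPv4 pattern: it does not validate that each octet is <= 255.
    IPV4_PATTERN = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')

    def __init__(self, raw_text):
        self.raw_text = raw_text

    def list_of_ip_adresses_contained_in_raw_text(self):
        # Return unique matches, preserving first-seen order.
        seen = []
        for match in self.IPV4_PATTERN.findall(self.raw_text):
            if match not in seen:
                seen.append(match)
        return seen

Called as SimpleIpExtractor('hosts 10.0.0.1 and 10.0.0.2').list_of_ip_adresses_contained_in_raw_text(), this returns ['10.0.0.1', '10.0.0.2'].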
Code Example #2
from text_parser import TextParser
import time

path = '/home/tim/Dropbox/Notes/journal.txt'

tp = TextParser(path)
tp.prepare_batches(128, 32, 4)

# Vocabulary size
print(len(tp.vocab.keys()))

# Alternate between the 'cv' and 'train' splits, pulling a feed dict from each.
for i in range(10):
    tp.switch_split('cv')
    print(tp.get_next_feed_dict('a', 'b'))
    tp.switch_split('train')
    print(tp.get_next_feed_dict('a', 'b'))
    tp.switch_split('cv')
    print(tp.get_next_feed_dict('a', 'b'))
    tp.switch_split('train')
    print(tp.get_next_feed_dict('a', 'b'))

# Pull a large number of feed dicts in a row.
for i in range(1000000):
    tp.get_next_feed_dict('a', 'b')
Code Example #3
def tokenize(text):
    # Tokenize the text with stemming; stopwords are read from stopwords.txt.
    tokenizer = TextParser(stopword_file='stopwords.txt')
    tokens = tokenizer.parse_words(text, stem=True)
    return tokens
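
For reference, calling the helper looks like the snippet below; the exact output depends on the stemmer and the contents of stopwords.txt, so the tokens in the comment are only indicative.

tokens = tokenize("The cats were running across the garden")
print(tokens)  # e.g. ['cat', 'run', 'garden'] or similar, depending on stemming and stopwords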
Code Example #4
    def clean_words(self, word_parser):
        self.words = word_parser.parse_words(self.raw_message)

    # clean words for one message (NOT efficient)
    def clean_words_pos(self, word_parser, preserve_types, ark_run_cmd):
        self.words = word_parser.parse_words_by_ark_nlp(self.raw_message,
                                                        preserve_types, ark_run_cmd)

    def set_clean_words(self, clean_words):
        # self.words = list(set(clean_words))
        self.words = clean_words

    def remove_stopwords(self, stopword_set):
        # Keep only words that are not in the stopword set.
        self.words = [w for w in self.words if w not in stopword_set]


if __name__ == '__main__':
    wp = TextParser(min_length=2)
    m = Message(
        'hello, This is@ went octopi just a test for 12you!. Try it http://')
    preserve_types = ['V', 'N', '^']
    ark_run_cmd = 'java -XX:ParallelGCThreads=2 -Xmx2G -jar /Users/chao/Dropbox/code/lib/ark-tweet-nlp-0.3.2.jar'
    m.clean_words(wp)
    print(m.words)
    m.clean_words_pos(wp, set(['S', 'N', '^']), ark_run_cmd)
    print(m.words)
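
Continuing the same script, remove_stopwords can be exercised as shown below; the stopword set here is arbitrary and only for illustration.

    m.clean_words(wp)
    m.remove_stopwords({'a', 'is', 'for', 'it', 'just'})  # arbitrary example stopword set
    print(m.words)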