Beispiel #1
0
def create_template_corpus(lang='de'):
    """Creates a template corpus where for each news url, the headline css
    selector and wayback_url of the news is stored.

    Runs forever: every pass fetches the current story urls, processes the
    news built from each story, stores the valid ones through the redis
    client and then sleeps for 300 seconds.

    # Arguments:
        lang: language of the corpus
    """
    redis_client = RedisClient(lang=lang)
    base_url = get_base_url(lang=lang)
    while True:
        for url in get_story_urls(base_url):
            query = parse_qs(urlparse(url).query, keep_blank_values=True)
            story = query['q'][0]
            # Decode byte strings exactly once, as UTF-8. Calling
            # unicode() on bytes or .decode() on text goes through the
            # implicit ASCII codec on Python 2 and fails for any story
            # title containing non-ASCII characters.
            if isinstance(story, bytes):
                story = story.decode('utf-8')
            # Lazy %-style arguments keep the message unicode-safe and
            # skip the formatting when INFO logging is disabled.
            logging.info(u'Processing story "%s"', story)
            for news in build_news(url):
                if not news:
                    continue
                news = append_html(news, redis_client)
                news = append_headline_selector(news)
                if is_valid(news, field='headline_selector'):
                    redis_client.insert(news)
                else:
                    logging.debug('Ignoring invalid news with url: %s',
                                  news['url'])
        time.sleep(300)
Beispiel #2
0
def create_template_corpus(lang='de'):
    """Creates a template corpus where for each news url, the headline css
    selector and wayback_url of the news is stored.

    Runs forever: every pass fetches the current story urls, processes the
    news built from each story, stores the valid ones through the redis
    client and then sleeps for 300 seconds.

    # Arguments:
        lang: language of the corpus
    """
    redis_client = RedisClient(lang=lang)
    base_url = get_base_url(lang=lang)
    while True:
        for url in get_story_urls(base_url):
            query = parse_qs(urlparse(url).query, keep_blank_values=True)
            story = query['q'][0]
            # Decode byte strings exactly once, as UTF-8. Calling
            # unicode() on bytes or .decode() on text goes through the
            # implicit ASCII codec on Python 2 and fails for any story
            # title containing non-ASCII characters.
            if isinstance(story, bytes):
                story = story.decode('utf-8')
            # Lazy %-style arguments keep the message unicode-safe and
            # skip the formatting when INFO logging is disabled.
            logging.info(u'Processing story "%s"', story)
            for news in build_news(url):
                if not news:
                    continue
                news = append_html(news, redis_client)
                news = append_headline_selector(news)
                if is_valid(news, field='headline_selector'):
                    redis_client.insert(news)
                else:
                    logging.debug('Ignoring invalid news with url: %s',
                                  news['url'])
        time.sleep(300)
Beispiel #3
0
    def test_bs4(self):
        """Ad-hoc check that one stored page parses and its links tokenize.

        Looks up the url recorded under key '20/289' in the bookkeeping
        JSON file, prints it and, when ``is_valid`` accepts it, parses the
        matching raw page with BeautifulSoup (html5lib parser) and hands
        its ``<a>`` tags to ``Tokenizer.tokenize_link``.
        """
        from bs4 import BeautifulSoup
        import os
        from util import is_valid
        from tokenizer import Tokenizer
        import json

        # NOTE: machine-specific absolute path; the files opened below must
        # exist under this directory for the test to run.
        base_path = 'C:\\Users\\Jun-Wei\\Desktop\\webpages_raw'
        book_file = 'bookkeeping.json'
        # The bookkeeping key is '<upper>/<lower>'; the same two parts are
        # used as sub-directory and file name of the raw page.
        upper, lower = '20', '289'
        with open(os.path.join(base_path, book_file), 'r',
                  encoding='utf8') as f:
            book_data = json.load(f)
        url = book_data[upper + '/' + lower]
        print(url)
        if not is_valid(url):
            print('invalid')
        else:
            with open(os.path.join(base_path, upper, lower),
                      'r',
                      encoding='utf8') as f:
                soup = BeautifulSoup(f.read(), 'html5lib')
                #if soup.title:
                #    print('Title', soup.find('title').text)
                # Only tokenize when the page contains at least one link.
                if soup.find_all('a'):
                    token_data = Tokenizer.tokenize_link(
                        url, soup.find_all('a'))
                '''
Beispiel #4
0
 def preprocess(self, key_from, key_to):
     '''
     Write tokenized title, link and body files for the selected documents.

     The bookkeeping JSON maps '<upper>/<lower>' keys to urls. Only keys
     whose upper part lies between key_from and key_to (inclusive) are
     handled. A document is skipped (and reported as 'invalid') when
     util.is_valid rejects its url or when its '.body' output file
     already exists.
     '''
     book_path = os.path.join(self.base_dir, self.book_file)
     with open(book_path, 'r', encoding='utf8') as book_fp:
         book_data = json.load(book_fp)

     for key, url in book_data.items():
         upper, lower = key.split('/')  # e.g. '0/1'
         bucket = int(upper)
         if bucket > int(key_to) or bucket < int(key_from):
             continue

         if not util.is_valid(url):
             print(key, 'invalid')
             continue
         # All outputs of one document share this prefix and differ only
         # in their extension.
         out_prefix = os.path.join(self.output_dir, upper, lower)
         if os.path.exists(out_prefix + '.body'):
             print(key, 'invalid')
             continue

         print('processing:', key)
         page_path = os.path.join(self.base_dir, upper, lower)
         with open(page_path, 'r', encoding='utf8') as page_fp:
             soup = BeautifulSoup(page_fp.read(), 'html5lib')

             if soup.title:
                 title_tokens = Tokenizer.tokenize(soup.title.text)
                 self.save_text(out_prefix + '.title', title_tokens)

             anchors = soup.find_all('a')
             if anchors:
                 link_data = Tokenizer.tokenize_link(url, anchors)
                 self.save_json(out_prefix + '.link.json', link_data)

             if soup.body:
                 text = ' '.join(soup.body.stripped_strings)
                 # Cut the text of <script> and <style> elements back out
                 # of the flattened body text.
                 for tag_name in ('script', 'style'):
                     for node in soup.body.find_all(tag_name):
                         fragment = ' '.join(node.stripped_strings)
                         text = text.replace(fragment, '')
                 body_tokens = Tokenizer.tokenize(text)
                 self.save_text(out_prefix + '.body', body_tokens)
Beispiel #5
0
 def make_pairs(self, message):
     '''
     Yield consecutive word pairs of message, framed by START and END.

     Words are the whitespace-separated parts of message that pass
     is_valid. The first pair is (START, first word), the last pair is
     (last word, END); nothing is yielded when no valid word remains.
     '''
     tokens = [word for word in message.split() if is_valid(word)]
     if not tokens:
         return
     yield (START, tokens[0])
     for left, right in zip(tokens, tokens[1:]):
         yield (left, right)
     yield (tokens[-1], END)
Beispiel #6
0
 def load_doc(self):
     '''
     Load the token files of all valid documents into self.docs.

     For every bookkeeping entry whose url passes util.is_valid and whose
     '<doc_dir>/<upper>/<lower>.<index_type>' file exists, the file's
     whitespace-separated tokens are appended to self.docs and the
     document's position is recorded in self.index_to_key. Afterwards the
     index-to-key mapping is saved and the dictionary is built.
     '''
     position = 0
     for key, url in self.book_data.items():
         print(key)
         if not util.is_valid(url):
             continue
         upper, lower = key.split('/')  # e.g. '0/1'
         doc_path = os.path.join(
             self.doc_dir, upper, lower + '.' + self.index_type)
         if not os.path.exists(doc_path):
             continue
         with open(doc_path, 'r', encoding='utf8') as doc_fp:
             self.docs.append(doc_fp.read().split())
         # Positions are stored as string keys.
         self.index_to_key[str(position)] = key
         position += 1
     self.save_json(self.index_to_key, self.index_type + '_index_to_key.json')
     self.build_dict(self.index_type + '.dict')
Beispiel #7
0
def populate_template_corpus(lang='de'):
    """Populates the news with required fields and write them to json files.
    For each news object a json file which has the id of news is created

    News without a valid 'headline' field are skipped. Files are written
    to 'docs/<lang>/<id>.json'.

    # Arguments:
        lang: language of the corpus
    """
    redis_client = RedisClient(lang=lang)
    base = 'docs/' + lang + '/'
    for news in template.populate(redis_client):
        if not is_valid(news, field='headline'):
            continue
        filename = base + news['id'] + '.json'
        # Serialize before opening the file: opening with 'w' truncates
        # it, so a serialization error would otherwise leave an empty
        # document behind.
        # NOTE: the `encoding` keyword and `unicode()` are Python 2 APIs.
        data = json.dumps(news,
                          ensure_ascii=False,
                          encoding='utf8',
                          indent=4)
        with io.open(filename, 'w', encoding='utf8') as json_file:
            json_file.write(unicode(data))
        # Log only after the file has been written and closed.
        logging.info('Wrote document to disk: id=%s', news['id'])
Beispiel #8
0
def populate_template_corpus(lang='de'):
    """Populates the news with required fields and write them to json files.
    For each news object a json file which has the id of news is created

    News without a valid 'headline' field are skipped. Files are written
    to 'docs/<lang>/<id>.json'.

    # Arguments:
        lang: language of the corpus
    """
    redis_client = RedisClient(lang=lang)
    base = 'docs/' + lang + '/'
    for news in template.populate(redis_client):
        if not is_valid(news, field='headline'):
            continue
        filename = base + news['id'] + '.json'
        # Serialize before opening the file: opening with 'w' truncates
        # it, so a serialization error would otherwise leave an empty
        # document behind.
        # NOTE: the `encoding` keyword and `unicode()` are Python 2 APIs.
        data = json.dumps(news,
                          ensure_ascii=False,
                          encoding='utf8',
                          indent=4)
        with io.open(filename, 'w', encoding='utf8') as json_file:
            json_file.write(unicode(data))
        # Log only after the file has been written and closed.
        logging.info('Wrote document to disk: id=%s', news['id'])
Beispiel #9
0
def _send_response(conn, text):
    """Send a status reply of the form ``{'response': text}`` to the client."""
    send_obj({'response': text}, conn)


def _handle_request(conn, addr):
    """Serve a single GET or POST exchange on an accepted connection.

    Every rejected request gets an "error: ..." reply and ends the
    exchange. The caller is responsible for closing ``conn``.

    # Arguments:
        conn: connected client socket
        addr: (ip, port) pair of the client
    """
    try:
        request = read_obj(conn)
        method = request['method']
    except Exception:
        _send_response(conn, "error: failed to read from socket")
        return

    if method not in ('GET', 'POST'):
        _send_response(conn, "error: invalid command")
        return

    group = request['group']
    print("Got {} request from {} on port {} for group {}"
          .format(method, addr[0], addr[1], group))

    if method == 'GET':
        # A group can only be read once somebody has posted to it.
        if not (is_valid(group) and group in chat_groups):
            _send_response(conn, "error: invalid group name")
            return
        _send_response(conn, "ok")
        send_obj(chat_groups[group], conn)
        return

    # POST: the client sends group, user id and message in turn and waits
    # for a status reply after the first two.
    if not is_valid(group):
        _send_response(conn, "error: invalid group name")
        return
    _send_response(conn, "ok")

    user_id = read_obj(conn)['id']
    if not is_valid(user_id):
        _send_response(conn, "error: invalid user name")
        return
    _send_response(conn, "ok")

    msg = read_obj(conn)['_msg']

    header = "From {0} /{1}:{2} {3}".format(
        user_id, addr[0], addr[1],
        datetime.datetime.now().strftime('%a %b %d %H:%M:%S EST %Y'))
    # setdefault creates the group on its first post.
    chat_groups.setdefault(group, []).append(
        {'header': header, 'message': msg})


def server(p):
    """Accept chat clients on localhost port ``p`` forever, one at a time.

    Uses the module-level socket ``s`` and the module-level ``chat_groups``
    dict. Each accepted connection is handled completely and then closed,
    whatever the outcome; a failure while handling one client is printed
    and does not stop the accept loop.

    # Arguments:
        p: port number to bind to
    """
    s.bind(('localhost', p))        # Bind to the port
    s.listen(5)                     # Queue up to 5 pending connections
    while True:
        print('New server listening on port {0}'.format(p))
        c, addr = s.accept()        # Establish connection with client.
        try:
            _handle_request(c, addr)
        except Exception as e:
            print('something went wrong, maybe the client connection closed. ', e)
        finally:
            # Close on every path: rejected requests, GET, POST and errors.
            c.close()
Beispiel #10
0
#!/usr/bin/env python3
import csv
import pathlib
import sys

from util import is_valid

# Validate every CSV file below data/csv; exit with status 1 at the first
# invalid file and with status 0 when all files pass.
CSV_DATA = pathlib.Path('data/csv')
for f in CSV_DATA.glob('**/*.csv'):
    base = f.name
    # Use the stem to drop the extension: str.strip('.csv') removes any of
    # the characters '.', 'c', 's', 'v' from BOTH ends of the name, which
    # also eats leading/trailing letters of the course name itself.
    course_info = f.stem.split('-')
    print('Reading {} ...'.format(base))
    # newline='' is what the csv module expects from its input file; the
    # old 'U' mode flag is no longer accepted by Python 3.11+.
    with f.open(newline='') as csv_file:
        reader = csv.reader(csv_file)
        headings = next(reader, None)  # Skip the headings
        if headings is None:
            # An empty file has no heading row and cannot be valid.
            print('{} is empty'.format(base), file=sys.stderr)
            sys.exit(1)
        rows = list(reader)
        # Validate the CSV file
        if not is_valid(rows, headings):
            sys.exit(1)
sys.exit(0)
Beispiel #11
0
def parse_and_write_npcinfos(output_dir: str) -> None:
    """Parse the infoboxes of all 'Category:Monsters' pages into NpcInfos.

    Every infobox version that is not marked as removed and that has an
    'id' field is converted into an NpcInfo message; infoboxes that fail
    to parse only produce a warning. The collected NpcInfos message is
    written to '<output_dir>/npc_infos' through util.write_proto.

    Args:
        output_dir: directory that receives the npc_infos output.
    """
    npc_infos = NpcInfos()
    # The proto's field names do not change between infoboxes, so compute
    # them once instead of once per infobox.
    known_fields = [field.name for field in NpcInfo.DESCRIPTOR.fields]

    site = pywikibot.Site(fam=OSRSFamily(), code='en')
    category = pywikibot.Category(site, 'Category:Monsters')
    gen = pagegenerators.PreloadingGenerator(category.articles())
    for page in gen:
        code = mwparserfromhell.parse(page.get(), skip_style_tags=True)

        for infobox in util.get_infobox_versions(code):
            # Skips this infobox if the "removal" key is set.
            if ('removal' in infobox
                    and str(infobox['removal']).strip().lower() not in ('', 'no')):
                continue

            # Skips this infobox if there is no ID attribute.
            if 'id' not in infobox:
                continue

            # Parses string ID fields into an array of ints. Each part is
            # stripped first so that "1, 2" keeps both ids, and
            # isdecimal() only admits digits that int() can convert.
            id_parts = (part.strip() for part in str(infobox['id']).split(','))
            ids = [int(part) for part in id_parts if part.isdecimal()]

            # Deletes all unknown fields and fields with invalid values.
            filtered_infobox = {
                k: infobox[k]
                for k in known_fields
                if k in infobox and util.is_valid(infobox[k])
            }

            try:
                npc_info = ParseDict(filtered_infobox, NpcInfo())
                npc_info.ids[:] = ids
                npc_infos.npcs.append(npc_info)
            except ParseError:
                warn('Failed to parse JSON into NpcInfo proto: {}'.format(filtered_infobox))

    output_filename = os.path.join(output_dir, 'npc_infos')
    util.write_proto(npc_infos, output_filename)
    print('{0} NpcInfos written to: {1}.binarypb and {1}.textproto'.format(len(npc_infos.npcs), output_filename))