def create_template_corpus(lang='de'):
    """Creates a template corpus where for each news url, the headline css
    selector and wayback_url of the news is stored.

    # Arguments:
        lang: language of the corpus
    """
    redis_client = RedisClient(lang=lang)
    base_url = get_base_url(lang=lang)
    while True:
        story_urls = get_story_urls(base_url)
        for url in story_urls:
            # The story name is carried in the 'q' query parameter of the url.
            story = parse_qs(urlparse(url).query, keep_blank_values=True)['q']
            story = unicode(story[0], 'utf-8')
            logging.info(u'Processing story "{}"'.format(story))
            for news in build_news(url):
                if news:
                    news = append_html(news, redis_client)
                    news = append_headline_selector(news)
                    if is_valid(news, field='headline_selector'):
                        redis_client.insert(news)
                    else:
                        logging.debug('Ignoring invalid news with url: {}'
                                      .format(news['url']))
        time.sleep(300)
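# Hypothetical sketch of the is_valid() helper used above and in
# populate_template_corpus() below; the real implementation is not part of
# this snippet. Assumed behaviour: a news dict is valid when the given field
# exists and is non-empty.
def is_valid(news, field):
    return bool(news) and bool(news.get(field))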
def test_bs4(self):
    from bs4 import BeautifulSoup
    import os
    from util import is_valid
    from tokenizer import Tokenizer
    import json

    base_path = 'C:\\Users\\Jun-Wei\\Desktop\\webpages_raw'
    book_file = 'bookkeeping.json'
    upper, lower = '20', '289'
    with open(os.path.join(base_path, book_file), 'r', encoding='utf8') as f:
        book_data = json.load(f)
    url = book_data[upper + '/' + lower]
    print(url)
    if not is_valid(url):
        print('invalid')
    else:
        with open(os.path.join(base_path, upper, lower), 'r',
                  encoding='utf8') as f:
            soup = BeautifulSoup(f.read(), 'html5lib')
        # if soup.title:
        #     print('Title', soup.find('title').text)
        if soup.find_all('a'):
            token_data = Tokenizer.tokenize_link(url, soup.find_all('a'))
def preprocess(self, key_from, key_to):
    '''
    Generate sanitized text files for each document. Ignore invalid files.
    For each document, the text files for Title, Anchor and Body are generated.
    '''
    with open(os.path.join(self.base_dir, self.book_file), 'r',
              encoding='utf8') as f:
        book_data = json.load(f)
    for key, url in book_data.items():
        upper, lower = key.split('/')  # e.g. '0/1'
        # Only process keys inside the requested range.
        if int(upper) > int(key_to) or int(upper) < int(key_from):
            continue
        # Skip invalid urls and documents that were already processed.
        if not util.is_valid(url) or os.path.exists(
                os.path.join(self.output_dir, upper, lower + '.body')):
            print(key, 'invalid')
            continue
        print('processing:', key)
        with open(os.path.join(self.base_dir, upper, lower), 'r',
                  encoding='utf8') as f:
            soup = BeautifulSoup(f.read(), 'html5lib')
        if soup.title:
            tokens = Tokenizer.tokenize(soup.title.text)
            self.save_text(
                os.path.join(self.output_dir, upper, lower + '.title'),
                tokens)
        if soup.find_all('a'):
            link_data = Tokenizer.tokenize_link(url, soup.find_all('a'))
            self.save_json(
                os.path.join(self.output_dir, upper, lower + '.link.json'),
                link_data)
        if soup.body:
            # Join the visible body text, then strip out script and style
            # fragments before tokenizing.
            txt = ' '.join(s for s in soup.body.stripped_strings)
            for script in soup.body.find_all('script'):
                fragment = ' '.join(s for s in script.stripped_strings)
                txt = txt.replace(fragment, '')
            for style in soup.body.find_all('style'):
                fragment = ' '.join(s for s in style.stripped_strings)
                txt = txt.replace(fragment, '')
            tokens = Tokenizer.tokenize(txt)
            self.save_text(
                os.path.join(self.output_dir, upper, lower + '.body'),
                tokens)
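# The Tokenizer class is not included in this snippet. A minimal sketch of
# what Tokenizer.tokenize() is assumed to do (lowercase the text, split on
# non-alphanumeric characters, drop empty strings); the real tokenizer may
# differ.
import re

class Tokenizer:
    @staticmethod
    def tokenize(text):
        return [t for t in re.split(r'[^a-z0-9]+', text.lower()) if t]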
def make_pairs(self, message):
    words = message.split()
    words = list(filter(lambda word: is_valid(word), words))
    # Start at -1 so the first pair is (START, first word); an empty message
    # yields nothing.
    start_index = -1 if len(words) > 0 else 0
    for i in range(start_index, len(words)):
        if i == -1:
            yield (START, words[i + 1])
        elif i == len(words) - 1:
            yield (words[i], END)
        else:
            yield (words[i], words[i + 1])
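# Example use of make_pairs() to accumulate a Markov-style transition table.
# This is an illustrative sketch: `bot` is assumed to be an instance of the
# class that defines make_pairs(); START, END and is_valid come from the
# surrounding module.
from collections import defaultdict

def build_transitions(bot, messages):
    # Map each word (or START) to the list of words (or END) that follow it.
    transitions = defaultdict(list)
    for message in messages:
        for first, second in bot.make_pairs(message):
            transitions[first].append(second)
    return transitions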
def load_doc(self):
    index = 0
    for key, url in self.book_data.items():
        print(key)
        if not util.is_valid(url):
            continue
        upper, lower = key.split('/')  # e.g. '0/1'
        doc_path = os.path.join(self.doc_dir, upper,
                                lower + '.' + self.index_type)
        if os.path.exists(doc_path):
            with open(doc_path, 'r', encoding='utf8') as f:
                self.docs.append(f.read().split())
            self.index_to_key[str(index)] = key
            index += 1
    self.save_json(self.index_to_key,
                   self.index_type + '_index_to_key.json')
    self.build_dict(self.index_type + '.dict')
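# build_dict() is not shown in this snippet. Given that load_doc() collects
# token lists into self.docs and the target filename ends in '.dict', one
# plausible sketch uses a gensim Dictionary; this is an assumption, not the
# actual implementation, and the save location is also assumed.
import os
from gensim import corpora

def build_dict(self, filename):
    # Map every token in the loaded documents to an integer id and persist it.
    dictionary = corpora.Dictionary(self.docs)
    dictionary.save(os.path.join(self.doc_dir, filename))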
def populate_template_corpus(lang='de'):
    """Populates the news with required fields and writes them to json files.
    For each news object a json file named after the id of the news is created.

    # Arguments:
        lang: language of the corpus
    """
    redis_client = RedisClient(lang=lang)
    for news in template.populate(redis_client):
        if not is_valid(news, field='headline'):
            continue
        base = 'docs/' + lang + '/'
        filename = base + news['id'] + '.json'
        with io.open(filename, 'w', encoding='utf8') as json_file:
            data = json.dumps(news, ensure_ascii=False, encoding='utf8',
                              indent=4)
            logging.info('Wrote document to disk: id={}'.format(news['id']))
            json_file.write(unicode(data))
def server(p):
    s.bind(('localhost', p))  # Bind to the port
    # Wait for client connections; the backlog argument is the number of
    # connections to queue.
    s.listen(5)
    while True:
        print('New server listening on port {0}'.format(p))
        c, addr = s.accept()  # Establish connection with client.
        resp_obj = {}
        try:
            # Validate method and socket
            thing = read_obj(c)
            if thing['method'] not in ['GET', 'POST']:
                resp_obj['response'] = "error: invalid command"
                send_obj(resp_obj, c)
                c.close()
                continue
        except Exception:
            resp_obj['response'] = "error: failed to read from socket"
            send_obj(resp_obj, c)
            c.close()
            continue

        method = thing['method']
        group = thing['group']
        print("Got {} request from {} on port {} for group {}"
              .format(method, addr[0], addr[1], group))

        if method == 'GET':
            # Handle GET Request: check for a valid group
            if is_valid(group) and group in chat_groups:
                resp_obj['response'] = "ok"
                send_obj(resp_obj, c)
            else:
                resp_obj['response'] = "error: invalid group name"
                send_obj(resp_obj, c)
                c.close()
                continue
            send_obj(chat_groups[group], c)
        elif method == 'POST':
            # Handle POST Request
            try:
                # Check for a valid group
                if is_valid(group):
                    resp_obj['response'] = "ok"
                    send_obj(resp_obj, c)
                else:
                    resp_obj['response'] = "error: invalid group name"
                    send_obj(resp_obj, c)
                    c.close()
                    continue
                # Now check for a valid user
                thing = read_obj(c)
                id = thing['id']
                if is_valid(id):
                    resp_obj['response'] = "ok"
                    send_obj(resp_obj, c)
                else:
                    resp_obj['response'] = "error: invalid user name"
                    send_obj(resp_obj, c)
                    c.close()
                    continue
                # Now grab the message from the user
                thing = read_obj(c)
                msg = thing['_msg']
                # Initialize the group in the dict if it is new
                if group not in chat_groups:
                    chat_groups[group] = []
                # Create and record a post log
                post = {}
                post['header'] = "From {0} /{1}:{2} {3}".format(
                    id, addr[0], addr[1],
                    datetime.datetime.now().strftime('%a %b %d %H:%M:%S EST %Y'))
                post['message'] = msg
                chat_groups[group].append(post)
            except Exception as e:
                print('something went wrong, maybe the client connection '
                      'closed. ', e)
        # Finally, close the connection
        c.close()
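# send_obj() and read_obj() are not defined in this snippet. One plausible
# sketch, assuming newline-delimited JSON over the socket (an assumption,
# not the actual wire format used by this server):
import json

def send_obj(obj, conn):
    # Serialize the object as one JSON line and push it over the socket.
    conn.sendall((json.dumps(obj) + '\n').encode('utf-8'))

def read_obj(conn):
    # Read bytes until a newline, then decode a single JSON object.
    buf = b''
    while not buf.endswith(b'\n'):
        chunk = conn.recv(4096)
        if not chunk:
            break
        buf += chunk
    return json.loads(buf.decode('utf-8'))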
#!/usr/bin/env python3
import csv
import pathlib
import sys

from util import is_valid

CSV_DATA = pathlib.Path('data/csv')

for f in CSV_DATA.glob('**/*.csv'):
    base = f.parts[-1]
    course_info = f.stem.split('-')  # filename without the .csv suffix, split on '-'
    print('Reading {} ...'.format(base))
    with f.open('r') as csv_file:
        reader = csv.reader(csv_file)
        headings = next(reader)  # Skip the headings
        rows = list(reader)
    # Validate the CSV file
    if not is_valid(rows, headings):
        sys.exit(1)

sys.exit(0)
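# Hypothetical sketch of the is_valid() check imported from util above; the
# real validation rules are not shown. Assumed behaviour: every heading is
# non-empty and every row has the same number of columns as the header.
def is_valid(rows, headings):
    if not headings or any(not h.strip() for h in headings):
        return False
    return all(len(row) == len(headings) for row in rows)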
def parse_and_write_npcinfos(output_dir: str):
    npc_infos = NpcInfos()
    site = pywikibot.Site(fam=OSRSFamily(), code='en')
    category = pywikibot.Category(site, 'Category:Monsters')
    gen = pagegenerators.PreloadingGenerator(category.articles())
    for page in gen:
        code = mwparserfromhell.parse(page.get(), skip_style_tags=True)
        for infobox in util.get_infobox_versions(code):
            # Skip this infobox if the "removal" key is set.
            if 'removal' in infobox and \
                    str(infobox['removal']).strip().lower() not in ['', 'no']:
                continue
            # Skip this infobox if there is no ID attribute.
            if 'id' not in infobox:
                continue
            # Parse the string ID field into an array of ints.
            ids = [int(npc_id) for npc_id in str(infobox['id']).split(',')
                   if npc_id != '' and npc_id.isdigit()]
            # Delete all unknown fields and fields with invalid values.
            known_fields = [f.name for f in NpcInfo.DESCRIPTOR.fields]
            filtered_infobox = {k: infobox[k] for k in known_fields
                                if k in infobox and util.is_valid(infobox[k])}
            try:
                npc_info = ParseDict(filtered_infobox, NpcInfo())
                npc_info.ids[:] = ids
                npc_infos.npcs.append(npc_info)
            except ParseError:
                warn('Failed to parse JSON into NpcInfo proto: {}'
                     .format(filtered_infobox))
    output_filename = os.path.join(output_dir, 'npc_infos')
    util.write_proto(npc_infos, output_filename)
    print('{0} NpcInfos written to: {1}.binarypb and {1}.textproto'.format(
        len(npc_infos.npcs), output_filename))
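# A minimal, hypothetical entry point for parse_and_write_npcinfos(); the
# original script's command-line interface, if any, is not shown in this
# snippet.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Scrape NpcInfo protos from the OSRS wiki Monsters category.')
    parser.add_argument('--output_dir', default='.',
                        help='Directory for npc_infos.binarypb and npc_infos.textproto')
    args = parser.parse_args()
    parse_and_write_npcinfos(args.output_dir)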