def transform_row_to_rayyan(irow):
    orow = {}
    orow['title'] = irow['title']
    orow['abstract'] = irow['abstract']
    #orow['url'] = DX_DOI_PREFIX + irow['doi']
    orow['url'] = irow['url']
    orow['pmc_id'] = irow['pmcid']
    orow['pubmed_id'] = irow['pubmed_id']

    publish_time = irow['publish_time'].strip()
    try:
        # First, try parsing as a date range. This should catch most date
        # formats except those in 'DD-MM-YY' and some other forms.
        start, end = rangeparse(publish_time)
    except:
        # If parsing as a date range fails, take the first word. It's
        # usually 'YYYY' or 'DD-MM-YY', which is good enough.
        publish_time = publish_time.split(' ')[0]
    if publish_time:
        try:
            # Try another parse as a date range.
            start, end = rangeparse(publish_time)
        except:
            # If that fails, it is 'DD-MM-YY', which normalparse can handle.
            start = normalparse(publish_time)
        orow['year'] = start.year
        orow['month'] = start.month
        orow['day'] = start.day
    else:
        orow['year'] = ''
        orow['month'] = ''
        orow['day'] = ''

    # The initial dataset had authors in list form.
    # Try parsing authors to see if it's a list.
    try:
        authors = ast.literal_eval(irow['authors'])
        if type(authors) == list:
            orow['authors'] = '; '.join(authors)
        else:
            raise RuntimeError
    except:
        # It's not a list; use the string as-is.
        orow['authors'] = irow['authors']

    orow['journal'] = irow['journal']

    # Everything else goes into a single free-text notes field.
    notes = []
    for col in ['cord_uid', 'sha', 'doi', 'source_x', 'license', 'mag_id',
                'who_covidence_id', 'pdf_json_files', 'pmc_json_files', 's2_id']:
        notes.append(col + ': ' + irow[col])
    orow['notes'] = '; '.join(notes)

    return orow
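# For context, a minimal sketch of how transform_row_to_rayyan() might be
# driven end to end with the standard csv module. The file names
# 'metadata.csv' and 'rayyan_import.csv' are hypothetical placeholders, and
# the output field list simply mirrors the keys assigned above.
import csv

RAYYAN_FIELDS = ['title', 'abstract', 'url', 'pmc_id', 'pubmed_id',
                 'year', 'month', 'day', 'authors', 'journal', 'notes']

with open('metadata.csv', newline='', encoding='utf-8') as inf, \
        open('rayyan_import.csv', 'w', newline='', encoding='utf-8') as outf:
    writer = csv.DictWriter(outf, fieldnames=RAYYAN_FIELDS)
    writer.writeheader()
    for irow in csv.DictReader(inf):
        writer.writerow(transform_row_to_rayyan(irow))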
def init_tournaments_csv(self):
    csv_f = open('/tmp/wizards_tournaments.csv', 'rb')
    csv_ur = UnicodeReader(csv_f)
    row = next(csv_ur)  # skip the first row, it has headings
    row = next(csv_ur, None)
    while row is not None:
        # Normalize the various Unicode dash characters to a plain ASCII hyphen.
        for idx in range(0, len(row)):
            for sillydash in ['\u2010', '\u2011', '\u2012', '\u2013',
                              '\u2014', '\u2015', '\u2212']:
                if row[idx].find(sillydash) > -1:
                    row[idx] = row[idx].replace(sillydash, '-')
        # Pick out a supported tournament format, if the row names one.
        fmt = 'Not Supported'
        for supfmt in ['Modern', 'Standard', 'Commander', 'Legacy', 'Tiny Leaders']:
            if row[5].find(supfmt) > -1:
                fmt = supfmt
                break
        st_date, end_date = rangeparse(row[6])
        row[3] = row[3].replace('*', '').strip()
        tourn = {'event': row[1],
                 'city': row[3],
                 'format': fmt,
                 'start_date': st_date,
                 'end_date': end_date}
        try:
            sys.stderr.write("tourn: {}\n".format(tourn))
        except UnicodeEncodeError:
            sys.stderr.write("FOOLISH PYTHON\n")
        # next() with a default avoids StopIteration at end of file.
        row = next(csv_ur, None)
    return dict()
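# Both functions above lean on rangeparse(), which is not shown in this
# section. Judging from its call sites, the assumed contract is: take a
# free-form date or date-range string and return a (start, end) pair of
# parsed dates, raising pyparsing.ParseException on input it cannot handle
# (that is the exception caught in the spider below). A small illustration
# of that assumed contract; the example strings are illustrative only.
import pyparsing

for text in ['December 2-7, 2014', '2020 Mar 15', 'not a date at all']:
    try:
        start, end = rangeparse(text)
        print('{!r} -> {} .. {}'.format(text, start, end))
    except pyparsing.ParseException:
        print('{!r} -> unparseable'.format(text))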
def parse_start_url(self, response):
    #self.log('Response for URL "{}", which has flags "{}"'.format(response.url, response.flags))
    if response.url == 'http://magic.wizards.com/en/events/coverage':
        # go through this document to create valid tournaments
        # for url in response.xpath('//a/@href').extract():
        for bloop in response.xpath('//p').extract():
            try:
                #self.log('This is a "{}"'.format(str(bloop)))
                pass
            except UnicodeEncodeError:
                pass
            p_match = re.compile('<p><(strong|b)>([^<]+)</(strong|b)>(.+)</p>$',
                                 re.U).match(bloop)
            if p_match:
                event_type_name = remove_tags(p_match.group(2))
                stuff = p_match.group(4)
                lines = stuff.split('<br>')
                for line in lines:
                    line_re = re.compile(
                        r'href="([^"]+)">(.+)</a> \(([^\)]+)\)([^A-Za-z]+([A-Z].+))?',
                        re.U)
                    line_match = line_re.search(line)
                    if line_match:
                        name = remove_tags(line_match.group(2))
                        fmt = 'Not Supported'
                        try:
                            sys.stderr.write("LINE: '{}'\n".format(line))
                        except UnicodeEncodeError:
                            sys.stderr.write("I HATE PYTHON UNICODE SUPPORT\n")
                        if line_match.group(5) is not None:
                            for supfmt in ['Modern', 'Standard', 'Commander', 'Tiny Leaders']:
                                if line_match.group(5).find(supfmt) > -1:
                                    fmt = supfmt
                                    break
                        if event_type_name == 'Grand Prix':
                            name = 'Grand Prix {}'.format(name)
                        if event_type_name == 'Pro Tour':
                            name = 'Pro Tour {}'.format(name)
                        dates_part = line_match.group(3)
                        if dates_part == 'December 2-3, 7, 2014':
                            dates_part = 'December 2-7, 2014'
                        clean_start_date = None
                        clean_end_date = None
                        try:
                            clean_start_date, clean_end_date = rangeparse(dates_part)
                        except pyparsing.ParseException:
                            pass
                        if clean_start_date is not None and clean_start_date.year > 2010:
                            if clean_end_date is None:
                                clean_end_date = clean_start_date
                            url = line_match.group(1)
                            if url.find('http') < 0:
                                url = 'http://magic.wizards.com{}'.format(url)
                            ti = TournamentItem(name=name,
                                                url=url,
                                                tournament_format=fmt,
                                                start_date=clean_start_date,
                                                end_date=clean_end_date)
                            yield ti
    else:
        # looking for decks on pages like
        # https://magic.wizards.com/en/events/coverage/2018natus/top-8-decklists-2018-07-01
        self.log("Let's try this...")
        if len(response.selector.xpath('//div[@class="deck-group"]')) > 0:
            # this page has deck listings on it!
            ti = TournamentItem()
            # let's get the event name and URL, if we can.
            breadcrumb_tournament = response.selector.xpath(
                '//div[@id="breadcrumb"]/span[not(@class="current")][last()]/a')
            if len(breadcrumb_tournament) > 0:
                self.log("breadcrumb_tournament = {}".format(breadcrumb_tournament))
                self.log("breadcrumb_tournament len = {}".format(len(breadcrumb_tournament)))
                ti['name'] = breadcrumb_tournament.xpath('.//text()').extract()[0]
                ti['url'] = breadcrumb_tournament.xpath('.//@href').extract()[0]
            # and now try to figure out the date
            posted_in = response.selector.xpath('//p[@class="posted-in"]/text()').extract()
            for val in posted_in:
                dre_match = DATE_RE.search(val)
                if dre_match:
                    tdate = dateparser.parse(dre_match.group(1)).date()
                    self.log("date is = {}".format(tdate))
                    ti['start_date'] = tdate
                    ti['end_date'] = tdate
                    break
            # and now the format...
            format_sels = response.selector.xpath(
                '//div[@id="content-detail-page-of-an-article"]/p/text()')
            ti['tournament_format'] = None
            for format_sel in format_sels:
                if ti['tournament_format'] is None:
                    val = format_sel.extract()
                    if 'Legacy' in val:
                        ti['tournament_format'] = 'Legacy'
                    if 'Standard' in val:
                        ti['tournament_format'] = 'Standard'
                    if 'Modern' in val:
                        ti['tournament_format'] = 'Modern'
            self.log("TournamentItem is {}".format(ti))
            # BOOKMARK - so, if I think I have a valid TournamentItem, I need to yield it
            page_place = 1
            for deckgroup_selector in response.selector.xpath('//div[@class="deck-group"]'):
                self.parse_deckgroup(response, deckgroup_selector, page_place)
                page_place += 1
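# TournamentItem itself is defined elsewhere in the project. A minimal
# Scrapy item declaration consistent with the fields populated in this
# spider might look like the following sketch; the real class may carry
# additional fields.
import scrapy

class TournamentItem(scrapy.Item):
    name = scrapy.Field()
    url = scrapy.Field()
    tournament_format = scrapy.Field()
    start_date = scrapy.Field()
    end_date = scrapy.Field()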