Exemple #1
0
def _parse_start_date(publish_time):
    """Return the start date parsed from a non-empty ``publish_time`` string.

    The whole string is first parsed as a date range.  If that fails, only
    the first space-separated word is kept (per the original author it is
    usually 'YYYY' or 'DD-MM-YY') and parsed as a range again; if that fails
    too, the word is handed to ``normalparse``.

    Any exception raised by ``normalparse`` propagates to the caller.
    """
    try:
        # A successful range parse is used directly; there is no need to
        # parse the same string a second time.
        start, _end = rangeparse(publish_time)
        return start
    except Exception:
        # Best-effort fallback.  ``Exception`` (not a bare ``except``) so
        # that KeyboardInterrupt / SystemExit are never swallowed.
        pass

    first_word = publish_time.split(' ')[0]
    try:
        start, _end = rangeparse(first_word)
    except Exception:
        # Not a range either: expected to be 'DD-MM-YY', which
        # normalparse can pick up.
        start = normalparse(first_word)
    return start


def transform_row_to_rayyan(irow):
    """Convert one input row into a row keyed for Rayyan.

    Args:
        irow: mapping with the keys 'title', 'abstract', 'url', 'pmcid',
            'pubmed_id', 'publish_time', 'authors', 'journal' and the note
            columns listed below.  The note columns must hold strings
            because they are concatenated into the 'notes' field.

    Returns:
        dict with the keys 'title', 'abstract', 'url', 'pmc_id',
        'pubmed_id', 'year', 'month', 'day', 'authors', 'journal' and
        'notes'.  'year', 'month' and 'day' are empty strings when
        'publish_time' is blank.

    Raises:
        KeyError: if a required key is missing from ``irow``.
    """
    orow = {}

    orow['title'] = irow['title']
    orow['abstract'] = irow['abstract']
    orow['url'] = irow['url']
    orow['pmc_id'] = irow['pmcid']
    orow['pubmed_id'] = irow['pubmed_id']

    publish_time = irow['publish_time'].strip()
    if publish_time:
        start = _parse_start_date(publish_time)
        orow['year'] = start.year
        orow['month'] = start.month
        orow['day'] = start.day
    else:
        orow['year'] = ''
        orow['month'] = ''
        orow['day'] = ''

    # The initial dataset had authors in list form (a Python list literal).
    # Anything that does not evaluate to a joinable list is kept as is.
    raw_authors = irow['authors']
    orow['authors'] = raw_authors
    try:
        authors = ast.literal_eval(raw_authors)
        if isinstance(authors, list):
            orow['authors'] = '; '.join(authors)
    except Exception:
        # Not a literal (or a list with non-string items): keep the string.
        pass

    orow['journal'] = irow['journal']

    note_columns = [
        'cord_uid', 'sha', 'doi', 'source_x', 'license', 'mag_id',
        'who_covidence_id', 'pdf_json_files', 'pmc_json_files', 's2_id',
    ]
    orow['notes'] = '; '.join(col + ': ' + irow[col] for col in note_columns)
    return orow
Exemple #2
0
 def init_tournaments_csv(self):
     """Read the Wizards tournaments CSV and log one parsed record per row.

     The file ``/tmp/wizards_tournaments.csv`` is read through
     ``UnicodeReader``; the first row is skipped as a header.  For every
     remaining row the Unicode dash variants are normalised to ``-``, the
     tournament format is looked up in column 5, the date range in column 6
     is parsed with ``rangeparse`` and ``*`` markers are removed from the
     city in column 3.  The resulting record is only written to stderr.

     Returns:
         An empty dict; the parsed tournaments are not collected.

     Raises:
         Whatever ``open``, ``UnicodeReader`` or ``rangeparse`` raise; a
         date that ``rangeparse`` rejects aborts the whole run.
     """
     # Unicode hyphen/dash/minus variants that are replaced by ASCII '-'.
     # The last three used to be written with a doubled backslash, which
     # matched the literal six-character escape text, not the dash itself.
     dash_table = str.maketrans(
         dict.fromkeys('\u2010\u2011\u2012\u2013\u2014\u2015\u2212', '-'))
     supported_formats = (
         'Modern', 'Standard', 'Commander', 'Legacy', 'Tiny Leaders')

     # NOTE(review): binary mode is kept from the original; presumably
     # UnicodeReader decodes the bytes itself -- confirm.
     with open('/tmp/wizards_tournaments.csv', 'rb') as csv_f:
         csv_ur = UnicodeReader(csv_f)
         # next() with a default returns None at end of input instead of
         # raising StopIteration, so the loop below can finish normally.
         next(csv_ur, None)  # skip first row, it has headings
         row = next(csv_ur, None)
         while row is not None:
             for idx, cell in enumerate(row):
                 row[idx] = cell.translate(dash_table)

             # First supported format mentioned in column 5 wins.
             fmt = 'Not Supported'
             for supfmt in supported_formats:
                 if supfmt in row[5]:
                     fmt = supfmt
                     break

             st_date, end_date = rangeparse(row[6])
             row[3] = row[3].replace('*', '').strip()
             tourn = {
                 'event': row[1],
                 'city': row[3],
                 'format': fmt,
                 'start_date': st_date,
                 'end_date': end_date
             }
             try:
                 sys.stderr.write("tourn: {}\n".format(tourn))
             except UnicodeEncodeError:
                 # Built-in exception; the Python 2 ``exceptions`` module
                 # does not exist on Python 3.
                 sys.stderr.write("FOOLISH PYTHON\n")
             row = next(csv_ur, None)
     return dict()
Exemple #3
0
 def init_tournaments_csv(self):
     """Read the Wizards tournaments CSV and log one parsed record per row.

     The file ``/tmp/wizards_tournaments.csv`` is read through
     ``UnicodeReader``; the first row is skipped as a header.  For every
     remaining row the Unicode dash variants are normalised to ``-``, the
     tournament format is looked up in column 5, the date range in column 6
     is parsed with ``rangeparse`` and ``*`` markers are removed from the
     city in column 3.  The resulting record is only written to stderr.

     Returns:
         An empty dict; the parsed tournaments are not collected.

     Raises:
         Whatever ``open``, ``UnicodeReader`` or ``rangeparse`` raise; a
         date that ``rangeparse`` rejects aborts the whole run.
     """
     # Unicode hyphen/dash/minus variants that are replaced by ASCII '-'.
     # Every entry carries the u prefix: without it a Python 2 str literal
     # holds the literal backslash escape text rather than the character.
     sillydashes = (u'\u2010', u'\u2011', u'\u2012', u'\u2013',
                    u'\u2014', u'\u2015', u'\u2212')
     supported_formats = ('Modern', 'Standard', 'Commander', 'Legacy',
                          'Tiny Leaders')

     with open('/tmp/wizards_tournaments.csv', 'rb') as csv_f:
         csv_ur = UnicodeReader(csv_f)
         try:
             csv_ur.next()  # skip first row, it has headings
         except StopIteration:
             # Empty file: nothing to process.
             return dict()

         while True:
             # .next() signals end of input with StopIteration; treat that
             # (and a None row, as the original loop did) as "done".
             try:
                 row = csv_ur.next()
             except StopIteration:
                 break
             if row is None:
                 break

             for idx in range(len(row)):
                 for sillydash in sillydashes:
                     row[idx] = row[idx].replace(sillydash, u'-')

             # First supported format mentioned in column 5 wins.
             fmt = 'Not Supported'
             for supfmt in supported_formats:
                 if supfmt in row[5]:
                     fmt = supfmt
                     break

             st_date, end_date = rangeparse(row[6])
             row[3] = row[3].replace('*', '').strip()
             tourn = {'event': row[1],
                      'city': row[3],
                      'format': fmt,
                      'start_date': st_date,
                      'end_date': end_date
                      }
             try:
                 sys.stderr.write("tourn: {}\n".format(tourn))
             except UnicodeEncodeError:
                 sys.stderr.write("FOOLISH PYTHON\n")
     return dict()
Exemple #4
0
    def parse_start_url(self, response):
        """Scrape tournaments from the coverage index, or inspect a decklist page.

        When ``response.url`` is the coverage index
        (``http://magic.wizards.com/en/events/coverage``), every ``<p>`` that
        opens with a ``<strong>``/``<b>`` heading is split on ``<br>``.  Each
        piece of the form ``href="URL">NAME</a> (DATES) ... Format`` becomes a
        ``TournamentItem`` and is yielded, provided ``rangeparse`` accepts the
        dates and the start year is later than 2010.

        For any other URL the page is checked for ``deck-group`` blocks.  If
        there are any, a ``TournamentItem`` is filled from the breadcrumb,
        the "posted-in" date and the article text, logged, and each deck
        group is passed to ``self.parse_deckgroup`` with its 1-based position.

        Args:
            response: page response exposing ``url``, ``xpath`` and
                ``selector.xpath``.

        Yields:
            TournamentItem: one per accepted entry on the coverage index.
            Nothing is yielded for decklist pages.
        """
        if response.url == 'http://magic.wizards.com/en/events/coverage':
            # go through this document to create valid tournaments
            # Patterns are compiled once, not once per paragraph / line.
            paragraph_re = re.compile(
                '<p><(strong|b)>([^<]+)</(strong|b)>(.+)</p>$', re.U)
            line_re = re.compile(
                r'href="([^"]+)">(.+)</a> \(([^\)]+)\)([^A-Za-z]+([A-Z].+))?',
                re.U)
            supported_formats = (
                'Modern', 'Standard', 'Commander', 'Tiny Leaders')

            for paragraph in response.xpath('//p').extract():
                p_match = paragraph_re.match(paragraph)
                if not p_match:
                    continue
                event_type_name = remove_tags(p_match.group(2))

                for line in p_match.group(4).split('<br>'):
                    line_match = line_re.search(line)
                    if not line_match:
                        continue

                    name = remove_tags(line_match.group(2))
                    try:
                        sys.stderr.write("LINE: '{}'\n".format(line))
                    except UnicodeEncodeError:
                        # Built-in exception; the Python 2 ``exceptions``
                        # module does not exist on Python 3.
                        sys.stderr.write("I HATE PYTHON UNICODE SUPPORT\n")

                    # Group 5 is the optional trailing text after the
                    # dates; the first supported format found in it wins.
                    fmt = 'Not Supported'
                    format_text = line_match.group(5)
                    if format_text is not None:
                        for supfmt in supported_formats:
                            if supfmt in format_text:
                                fmt = supfmt
                                break

                    if event_type_name == 'Grand Prix':
                        name = 'Grand Prix {}'.format(name)
                    if event_type_name == 'Pro Tour':
                        name = 'Pro Tour {}'.format(name)

                    dates_part = line_match.group(3)
                    if dates_part == 'December 2-3, 7, 2014':
                        # Special case: this exact string is rewritten to a
                        # plain range (presumably so rangeparse accepts it).
                        dates_part = 'December 2-7, 2014'

                    try:
                        clean_start_date, clean_end_date = rangeparse(
                            dates_part)
                    except pyparsing.ParseException:
                        # Unparseable dates: skip this entry.
                        continue
                    if clean_start_date is None or clean_start_date.year <= 2010:
                        continue
                    if clean_end_date is None:
                        clean_end_date = clean_start_date

                    url = line_match.group(1)
                    if 'http' not in url:
                        # No scheme anywhere in the link: treat as site-relative.
                        url = 'http://magic.wizards.com{}'.format(url)

                    yield TournamentItem(
                        name=name,
                        url=url,
                        tournament_format=fmt,
                        start_date=clean_start_date,
                        end_date=clean_end_date)
        else:
            # looking for decks on pages like https://magic.wizards.com/en/events/coverage/2018natus/top-8-decklists-2018-07-01
            self.log("Let's try this...")
            # Queried once and reused for both the check and the loop below.
            deck_groups = response.selector.xpath('//div[@class="deck-group"]')
            if len(deck_groups) > 0:
                # this page has deck listings on it!

                ti = TournamentItem()
                # let's get the event name and URL, if we can.
                breadcrumb_tournament = response.selector.xpath(
                    '//div[@id="breadcrumb"]/span[not(@class="current")][last()]/a'
                )
                if len(breadcrumb_tournament) > 0:
                    self.log("breadcrumb_tournament = {}".format(
                        breadcrumb_tournament))
                    self.log("breadcrumb_tournament len = {}".format(
                        len(breadcrumb_tournament)))
                    ti['name'] = breadcrumb_tournament.xpath(
                        './/text()').extract()[0]
                    ti['url'] = breadcrumb_tournament.xpath(
                        './/@href').extract()[0]

                # and now try to figure out the date: the first "posted-in"
                # text that matches DATE_RE supplies both start and end.
                posted_in = response.selector.xpath(
                    '//p[@class="posted-in"]/text()').extract()
                for val in posted_in:
                    dre_match = DATE_RE.search(val)
                    if dre_match:
                        tdate = dateparser.parse(dre_match.group(1)).date()
                        self.log("date is = {}".format(tdate))
                        ti['start_date'] = tdate
                        ti['end_date'] = tdate
                        break

                # and now the format: the first paragraph text naming a
                # format decides.  Within one text, later checks override
                # earlier ones, so Modern beats Standard beats Legacy.
                format_sels = response.selector.xpath(
                    '//div[@id="content-detail-page-of-an-article"]/p/text()')
                ti['tournament_format'] = None
                for format_sel in format_sels:
                    val = format_sel.extract()
                    for candidate in ('Legacy', 'Standard', 'Modern'):
                        if candidate in val:
                            ti['tournament_format'] = candidate
                    if ti['tournament_format'] is not None:
                        break

                self.log("TournamentItem is {}".format(ti))

                # BOOKMARK - so, if I think I have a valid TournamentItem, I need to yield it
                # NOTE(review): ``ti`` is never yielded and the result of
                # parse_deckgroup is discarded -- confirm this is intended.
                for page_place, deckgroup_selector in enumerate(deck_groups, 1):
                    self.parse_deckgroup(response, deckgroup_selector,
                                         page_place)
Exemple #5
0
    def parse_start_url(self, response):
        """Scrape tournaments from the coverage index, or inspect a decklist page.

        When ``response.url`` is the coverage index
        (``http://magic.wizards.com/en/events/coverage``), every ``<p>`` that
        opens with a ``<strong>``/``<b>`` heading is split on ``<br>``.  Each
        piece of the form ``href="URL">NAME</a> (DATES) ... Format`` becomes a
        ``TournamentItem`` and is yielded, provided ``rangeparse`` accepts the
        dates and the start year is later than 2010.

        For any other URL the page is checked for ``deck-group`` blocks.  If
        there are any, a ``TournamentItem`` is filled from the breadcrumb,
        the "posted-in" date and the article text, logged, and each deck
        group is passed to ``self.parse_deckgroup`` with its 1-based position.

        Args:
            response: page response exposing ``url``, ``xpath`` and
                ``selector.xpath``.

        Yields:
            TournamentItem: one per accepted entry on the coverage index.
            Nothing is yielded for decklist pages.
        """
        if response.url == 'http://magic.wizards.com/en/events/coverage':
            # go through this document to create valid tournaments
            # Both patterns are loop-invariant, so they are compiled up front.
            p_re = re.compile('<p><(strong|b)>([^<]+)</(strong|b)>(.+)</p>$', re.U)
            line_re = re.compile(r'href="([^"]+)">(.+)</a> \(([^\)]+)\)([^A-Za-z]+([A-Z].+))?', re.U)

            for bloop in response.xpath('//p').extract():
                p_match = p_re.match(bloop)
                if p_match is None:
                    continue
                event_type_name = remove_tags(p_match.group(2))
                stuff = p_match.group(4)

                for line in stuff.split('<br>'):
                    line_match = line_re.search(line)
                    if line_match is None:
                        continue

                    name = remove_tags(line_match.group(2))
                    fmt = 'Not Supported'
                    try:
                        sys.stderr.write("LINE: '{}'\n".format(line))
                    except UnicodeEncodeError:
                        # Same class as exceptions.UnicodeEncodeError, but
                        # available as a built-in on every Python version.
                        sys.stderr.write("I HATE PYTHON UNICODE SUPPORT\n")

                    # Group 5 is the optional trailing text after the
                    # dates; the first supported format found in it wins.
                    trailing = line_match.group(5)
                    if trailing is not None:
                        for supfmt in ['Modern', 'Standard', 'Commander', 'Tiny Leaders']:
                            if trailing.find(supfmt) > -1:
                                fmt = supfmt
                                break

                    if event_type_name == 'Grand Prix':
                        name = u'Grand Prix {}'.format(name)
                    if event_type_name == 'Pro Tour':
                        name = u'Pro Tour {}'.format(name)

                    dates_part = line_match.group(3)
                    if dates_part == 'December 2-3, 7, 2014':
                        # Special case: this exact string is rewritten to a
                        # plain range (presumably so rangeparse accepts it).
                        dates_part = 'December 2-7, 2014'

                    try:
                        clean_start_date, clean_end_date = rangeparse(dates_part)
                    except pyparsing.ParseException:
                        # Unparseable dates: skip this entry.
                        continue
                    if clean_start_date is None or not clean_start_date.year > 2010:
                        continue
                    if clean_end_date is None:
                        clean_end_date = clean_start_date

                    url = line_match.group(1)
                    if url.find('http') < 0:
                        # No scheme anywhere in the link: treat as site-relative.
                        url = 'http://magic.wizards.com{}'.format(url)
                    ti = TournamentItem(name=name,
                                        url=url,
                                        tournament_format=fmt,
                                        start_date=clean_start_date,
                                        end_date=clean_end_date)
                    yield ti
        else:
            # looking for decks on pages like https://magic.wizards.com/en/events/coverage/2018natus/top-8-decklists-2018-07-01
            self.log("Let's try this...")
            # Queried once and reused for both the check and the loop below.
            deckgroup_selectors = response.selector.xpath('//div[@class="deck-group"]')
            if len(deckgroup_selectors) > 0:
                # this page has deck listings on it!

                ti = TournamentItem()
                # let's get the event name and URL, if we can.
                breadcrumb_tournament = response.selector.xpath('//div[@id="breadcrumb"]/span[not(@class="current")][last()]/a')
                if len(breadcrumb_tournament) > 0:
                    self.log("breadcrumb_tournament = {}".format(breadcrumb_tournament))
                    self.log("breadcrumb_tournament len = {}".format(len(breadcrumb_tournament)))
                    ti['name'] = breadcrumb_tournament.xpath('.//text()').extract()[0]
                    ti['url'] = breadcrumb_tournament.xpath('.//@href').extract()[0]

                # and now try to figure out the date: the first "posted-in"
                # text that matches DATE_RE supplies both start and end.
                posted_in = response.selector.xpath('//p[@class="posted-in"]/text()').extract()
                for val in posted_in:
                    dre_match = DATE_RE.search(val)
                    if dre_match:
                        tdate = dateparser.parse(dre_match.group(1)).date()
                        self.log("date is = {}".format(tdate))
                        ti['start_date'] = tdate
                        ti['end_date'] = tdate
                        break

                # and now the format: the first paragraph text naming a
                # format decides.  Within one text, later checks override
                # earlier ones, so Modern beats Standard beats Legacy.
                format_sels = response.selector.xpath('//div[@id="content-detail-page-of-an-article"]/p/text()')
                ti['tournament_format'] = None
                for format_sel in format_sels:
                    val = format_sel.extract()
                    if 'Legacy' in val:
                        ti['tournament_format'] = 'Legacy'
                    if 'Standard' in val:
                        ti['tournament_format'] = 'Standard'
                    if 'Modern' in val:
                        ti['tournament_format'] = 'Modern'
                    if ti['tournament_format'] is not None:
                        # Decided; the remaining paragraphs are not examined.
                        break

                self.log("TournamentItem is {}".format(ti))

                # BOOKMARK - so, if I think I have a valid TournamentItem, I need to yield it
                # NOTE(review): ``ti`` is never yielded and the result of
                # parse_deckgroup is discarded -- confirm this is intended.
                page_place = 1
                for deckgroup_selector in deckgroup_selectors:
                    self.parse_deckgroup(response, deckgroup_selector, page_place)
                    page_place += 1