Esempio n. 1
0
 def test_parse_18(self):
     """It parses two collectors joined by 'and' with a trailing number."""
     actual = COLLECTOR.parse("""Sarah Nunn and Laura Eason 9834""")
     # Both traits share the span of the whole match; only the first
     # collector carries the collector number.
     expected = [
         Trait(col_name='Sarah Nunn', col_no='9834', start=0, end=31),
         Trait(col_name='Laura Eason', start=0, end=31),
     ]
     self.assertEqual(actual, expected)
Esempio n. 2
0
 def test_parse_15(self):
     """It splits collectors joined by 'with' and stops before the date."""
     actual = COLLECTOR.parse(
         'Sarah Nunn with Angela Brown 7529 20 October 2002 of')
     # The expected span ends right after the collector number (offset 33),
     # so the trailing date text is not part of the match.
     expected = [
         Trait(col_name='Sarah Nunn', col_no='7529', start=0, end=33),
         Trait(col_name='Angela Brown', start=0, end=33),
     ]
     self.assertEqual(actual, expected)
Esempio n. 3
0
 def test_parse_02(self):
     """It parses several collectors joined by 'and'."""
     actual = COLLECTOR.parse(
         'Sarah Nunn and S. Jacobs and R. Mc Elderry 9480')
     # One trait per collector; the number is attached to the first one.
     expected = [
         Trait(col_name='Sarah Nunn', col_no='9480', start=0, end=47),
         Trait(col_name='S. Jacobs', start=0, end=47),
         Trait(col_name='R. Mc Elderry', start=0, end=47),
     ]
     self.assertEqual(actual, expected)
 def test_parse_11(self):
     """It finds a state and a county that sit on separate lines."""
     label = textwrap.dedent("""
             APPALACHIAN STATE UNIVERSITY HERBARIUM
             PLANTS OF NORTH CAROLINA
             STONE MOUNTAIN STATE PARK
             WILKES COUNTY
             """)
     actual = ADMIN_UNIT.parse(label)
     # The state and the county are reported as two separate traits.
     expected = [
         Trait(us_state='North Carolina', start=40, end=64),
         Trait(us_county='Wilkes', start=91, end=104),
     ]
     self.assertEqual(actual, expected)
Esempio n. 5
0
 def test_parse_16(self):
     """It parses collectors joined by '&' with the number on the next line."""
     label = textwrap.dedent("""
             Collector: Christopher Reid & Sarah Nunn
             No.: 2018 Date: 16 May 2001
             """)
     actual = COLLECTOR.parse(label)
     # The span starts at offset 1 because the dedented text begins with a
     # newline.
     expected = [
         Trait(col_name='Christopher Reid',
               col_no='2018',
               start=1,
               end=51),
         Trait(col_name='Sarah Nunn', start=1, end=51),
     ]
     self.assertEqual(actual, expected)
 def test_parse_11(self):
     """It parses a written-out date that is followed by noisy text."""
     label = textwrap.dedent("""
             Slender erect shrubs 4-5 m.tall.
             June 7, 1923 xo, 23163""")
     actual = LABEL_DATE.parse(label)
     # Only the date itself is expected, normalized to ISO format.
     expected = [Trait(value='1923-06-07', start=34, end=46)]
     self.assertEqual(actual, expected)
def convert(token):
    """Build collector traits from a matched collector token.

    The ``col_name`` group is split into individual names on the whole words
    "and"/"with" and on the characters "," and "&". One trait is built per
    usable name, and every trait gets the token's start/end span. When a
    ``collector_no`` group is present it is attached to the first trait only.

    Args:
        token: The matched token. Its ``group`` mapping is read for the
            optional 'col_name' and 'collector_no' entries, and its
            ``start``/``end`` are copied onto every trait.

    Returns:
        The result of ``squash(traits)``, or None when there is no usable
        collector name or when the collector number ends in 'm'/'M'.
    """
    col_name = token.group.get('col_name')
    if not col_name:
        # Nothing to split; avoids passing None to regex.split.
        return None

    # "and"/"with" must be whole words, otherwise names that merely contain
    # those letters (e.g. "Sandra", "Alexander") would be cut apart.
    names = regex.split(r'\s*(?:\b(?:and|with)\b|[,&])\s*', col_name)

    traits = []

    # Pair each name with the piece that follows it, so a suffix that was
    # split off by a comma can be re-attached to the name before it.
    for name, suffix in zip_longest(names, names[1:], fillvalue=''):
        # Drop a run of three or more dots and everything after it.
        name = regex.sub(r'\.{3,}.*', '', name)
        if len(name) < MIN_LEN:
            continue

        # A bare suffix is not a collector in its own right.
        if name.lower() in name_parts.SUFFIXES:
            continue

        if suffix.lower() in name_parts.SUFFIXES:
            name = f'{name} {suffix}'

        trait = Trait(start=token.start, end=token.end)
        trait.col_name = name
        traits.append(trait)

    if not traits:
        return None

    if token.group.get('collector_no'):
        col_no = token.group['collector_no']
        # Temp hack (kept from the original): reject the whole match when the
        # number ends in 'm'/'M'. NOTE(review): presumably this filters out
        # measurements in meters mistaken for collector numbers -- confirm.
        if col_no[-1] in ('m', 'M'):
            return None
        traits[0].col_no = col_no

    return squash(traits)
 def test_parse_15(self):
     """It handles an end-of-line between the state label and the state."""
     label = textwrap.dedent("""
             PLANTS OF
             North Carolina
             """)
     actual = ADMIN_UNIT.parse(label)
     # The expected span covers both lines: the "PLANTS OF" label and the
     # state name that follows it.
     expected = [Trait(us_state='North Carolina', start=1, end=25)]
     self.assertEqual(actual, expected)
Esempio n. 9
0
 def test_parse_03(self):
     """It returns the full taxon notation as a single value."""
     actual = PLANT_TAXON.parse("""Cephalanthus occidentalis L. Rubiaceas""")
     # The entire input string is expected back as the trait value.
     expected = [
         Trait(value='Cephalanthus occidentalis L. Rubiaceas',
               start=0,
               end=38),
     ]
     self.assertEqual(actual, expected)
Esempio n. 10
0
 def test_parse_04(self):
     """It keeps only the usable part of a garbled collector name."""
     label = textwrap.dedent("""
             APPALACHIAN STATE UNIVERSITY HERBARIUM
             PLANTS OF NORTH CAROLINA
             Collected by _Wayne.. Hutchins.
             """)
     actual = COLLECTOR.parse(label)
     # Only 'Hutchins' is expected as the name, while the span still covers
     # the whole "Collected by ..." phrase.
     expected = [Trait(col_name='Hutchins', start=65, end=95)]
     self.assertEqual(actual, expected)
 def test_parse_07(self):
     """It finds the county inside noisy surrounding text."""
     label = textwrap.dedent("""
             Cornus drummondii C. A. Mey.
             Hempstead County
             Grandview Prairie; on CR 35, 10 air miles S/SE of Nashville; in
         """)
     actual = ADMIN_UNIT.parse(label)
     # Only the county line is expected to produce a trait.
     expected = [Trait(us_county='Hempstead', start=30, end=46)]
     self.assertEqual(actual, expected)
 def test_parse_10(self):
     """It parses a slash-separated date with no spaces between the parts."""
     label = textwrap.dedent("""
             Altitude 1000 ft.
             Date 8/20/75
             Collected by ...Wayne. Hutchins No.
         """)
     actual = LABEL_DATE.parse(label)
     # The two-digit year is expected to come back as 1975.
     expected = [Trait(value='1975-08-20', start=19, end=31)]
     self.assertEqual(actual, expected)
Esempio n. 13
0
 def test_parse_13(self):
     """It parses a collector number made of several parts."""
     actual = COLLECTOR.parse('Coll. Stephen W. Bailey No, SWBII 1)')
     # The number keeps both parts ('SWBII 1'); the closing parenthesis is
     # outside the expected span.
     expected = [
         Trait(col_name='Stephen W. Bailey',
               col_no='SWBII 1',
               start=0,
               end=35),
     ]
     self.assertEqual(actual, expected)
Esempio n. 14
0
 def test_parse_10(self):
     """It ignores unrelated words on the line before the collector."""
     label = textwrap.dedent("""
             WGS84. Flevation: 1,524 m (5000 ft) - 1530 m (5020 ft)
             Andrew Jenkins 427 93/2009
             """)
     actual = COLLECTOR.parse(label)
     # Only the name and number on the second line are expected.
     expected = [
         Trait(col_name='Andrew Jenkins', col_no='427', start=56, end=74),
     ]
     self.assertEqual(actual, expected)
Esempio n. 15
0
 def test_parse_01(self):
     """It extracts a taxon notation from between other label lines."""
     label = textwrap.dedent("""
             Cornaceae
             Cornus obliqua Raf.
             Washington County""")
     actual = PLANT_TAXON.parse(label)
     # Only the binomial with its authority is expected, not the family or
     # county lines around it.
     expected = [Trait(value='Cornus obliqua Raf.', start=11, end=30)]
     self.assertEqual(actual, expected)
 def test_parse_03(self):
     """It moves a date that lies in the future back by one century."""
     # Tomorrow's date written with a two-digit year.
     next_day = date.today() + relativedelta(days=1)
     label = next_day.strftime('%d %b %y')
     # The same calendar day one hundred years earlier, in ISO format.
     century_ago = date.today() + relativedelta(years=-100, days=1)
     iso_expected = century_ago.strftime('%Y-%m-%d')
     actual = LABEL_DATE.parse(label)
     expected = [
         Trait(value=iso_expected, century_adjust=True, start=0, end=9),
     ]
     self.assertEqual(actual, expected)
Esempio n. 17
0
 def test_parse_02(self):
     """It picks the family name out of a multi-line label."""
     label = textwrap.dedent("""
             Crowley's Ridge
             Fabaceae
             Vicia villosa Roth ssp. varia (Host) Corb.
             CLAY COUNTY
             """)
     actual = PLANT_FAMILY.parse(label)
     # Only the family line is expected to produce a trait.
     expected = [Trait(value='Fabaceae', start=17, end=25)]
     self.assertEqual(actual, expected)
Esempio n. 18
0
 def test_parse_05(self):
     """It does not treat stray single letters as part of the name."""
     label = textwrap.dedent("""
             x
             e
             Coll. Marie P. Locke No. 5595
             """)
     actual = COLLECTOR.parse(label)
     # The expected span starts at the "Coll." line, after the 'x' and 'e'
     # lines.
     expected = [
         Trait(col_name='Marie P. Locke', col_no='5595', start=5, end=34),
     ]
     self.assertEqual(actual, expected)
Esempio n. 19
0
    def test_parse_14(self):
        """It joins a collector to a number separated from it by newlines."""
        label = textwrap.dedent("""
                Coll. Marie P. Locke

                No. 2319
                Date September 4, 1977
                """)
        actual = COLLECTOR.parse(label)
        # The blank line between the name and "No." must not break the match.
        expected = [
            Trait(col_name='Marie P. Locke', col_no='2319', start=1, end=31),
        ]
        self.assertEqual(actual, expected)
Esempio n. 20
0
def convert(token):
    """Build an administrative-unit trait from a matched token.

    Copies the token's span onto a new trait, then fills in the county
    (title-cased) and/or the state (passed through
    ``us_states.normalize_state``) when the matching group is present and
    non-empty.
    """
    trait = Trait(start=token.start, end=token.end)
    groups = token.group

    county = groups.get('us_county')
    if county:
        trait.us_county = county.title()

    state = groups.get('us_state')
    if state:
        trait.us_state = us_states.normalize_state(state)

    return trait
Esempio n. 21
0
 def test_parse_06(self):
     """It does not mistake ordinary label words for collector names."""
     label = textwrap.dedent("""
             Woodsia obtusa (Sprengel) Torrey
             Dry hardwood slope 3 miles south of
             Isothermal Community College.
             Altitude 960 ft.
             Date 6/9/75
             Collected by _Wayne.. Hutchins.
             """)
     actual = COLLECTOR.parse(label)
     # Only the "Collected by" line is expected to yield a collector.
     expected = [Trait(col_name='Hutchins', start=129, end=159)]
     self.assertEqual(actual, expected)
Esempio n. 22
0
 def test_parse_08(self):
     """It finds one collector in a long label with name-like words."""
     label = textwrap.dedent("""
             PRINGLE HERBARIUM
             DEPT. OF BOTANY @ UNIVERSITY OF VERMONT
             Campyloneuron repens (Auble.) C.Pres]
             Costa Rica, Cartago Prov. Valle de La Estrella. Road
             between Estrella and Vara de Roble. 1600m. Steep canyon
             with stream on east flowing into Rio Empalme,
             ex Herb Hugo Churchill
             3466
             Hugo Churchill Feb, 5, 1980
             """)
     actual = COLLECTOR.parse(label)
     # A single trait without a collector number is expected.
     expected = [Trait(col_name='Hugo Churchill', start=280, end=294)]
     self.assertEqual(actual, expected)
Esempio n. 23
0
 def test_parse_03(self):
     """It reports the collector and not the determiner or other fields."""
     label = textwrap.dedent("""
             Rhus glabra L. "Smooth Sumac"
             Woodruff Co., Arkansas
             Vicinity of bridge on Hwy 33, ca. 2 mi. S. of the
             town of Gregory; S19, T6N; R3W.
             Det, Edwin B. Smith
             Coll. Marie P. Locke No. 5595
             Date June 29, 1985
             """)
     actual = COLLECTOR.parse(label)
     # Only the "Coll." line is expected to produce a trait; the "Det," line
     # names a person but is not a collector entry.
     expected = [
         Trait(col_name='Marie P. Locke',
               col_no='5595',
               start=156,
               end=185),
     ]
     self.assertEqual(actual, expected)
Esempio n. 24
0
def convert(token):
    """Normalize a parsed date into an ISO-formatted trait value.

    Args:
        token: The matched token. Its ``group['value']`` holds the raw date
            text, and its ``start``/``end`` are copied onto the trait.

    Returns:
        A trait whose ``value`` is a 'YYYY-MM-DD' string, with
        ``century_adjust`` set to True when the date was moved back one
        hundred years. Returns None when the cleaned text is shorter than
        four characters or cannot be parsed as a date.
    """
    trait = Trait(start=token.start, end=token.end)

    # Collapse every run of non-alphanumeric characters into a single dash
    # so the date parser sees one uniform separator.
    value = regex.sub(r'[^a-z\d]+',
                      '-',
                      token.group['value'],
                      flags=regex.I | regex.X)

    if len(value) < 4:
        return None

    try:
        parsed = parser.parse(value).date()
    except (parser.ParserError, IllegalMonthError, OverflowError):
        # NOTE(review): `parser` looks like dateutil's parser, whose parse()
        # is documented to raise OverflowError for oversized numbers; treat
        # that like any other unparseable date -- confirm the import.
        return None

    # A date after today is treated as a century error (e.g. a two-digit
    # year) and moved back one hundred years.
    if parsed > date.today():
        parsed -= relativedelta(years=100)
        trait.century_adjust = True

    # date.isoformat() already yields 'YYYY-MM-DD'.
    trait.value = parsed.isoformat()
    return trait
 def test_parse_13(self):
     """It finds the correct parse for a 'county, state' pair."""
     actual = ADMIN_UNIT.parse("""Cape May, New Jersey""")
     # County and state are expected together in a single trait.
     expected = [
         Trait(us_state='New Jersey', us_county='Cape May',
               start=0, end=20),
     ]
     self.assertEqual(actual, expected)
 def test_parse_12(self):
     """It parses a multi-word county followed by a multi-word state."""
     actual = ADMIN_UNIT.parse("""Cape May, New Jersey""")
     # Both names contain a space and must be kept whole.
     expected = [
         Trait(us_state='New Jersey', us_county='Cape May',
               start=0, end=20),
     ]
     self.assertEqual(actual, expected)
 def test_parse_10(self):
     """It parses a two-word state after the 'PLANTS OF' label."""
     actual = ADMIN_UNIT.parse("""PLANTS OF NORTH CAROLINA""")
     # The upper-case input is expected back in title case.
     expected = [Trait(us_state='North Carolina', start=0, end=24)]
     self.assertEqual(actual, expected)
 def test_parse_09(self):
     """It parses a single-word state after the 'PLANTS OF' label."""
     actual = ADMIN_UNIT.parse("""PLANTS OF ARKANSAS""")
     # The span covers the label as well as the state name.
     expected = [Trait(us_state='Arkansas', start=0, end=18)]
     self.assertEqual(actual, expected)
 def test_parse_08(self):
     """It accepts a common OCR misreading of the word 'County'."""
     actual = ADMIN_UNIT.parse("""Caldwell Councy""")
     # 'Councy' must still be recognized as the county label.
     expected = [Trait(us_county='Caldwell', start=0, end=15)]
     self.assertEqual(actual, expected)
 def test_parse_06(self):
     """It expands a state abbreviation to the full state name."""
     actual = ADMIN_UNIT.parse('Desha Co., Ark.')
     # 'Ark.' is expected as 'Arkansas'; the span ends at 14, before the
     # final period.
     expected = [
         Trait(us_county='Desha', us_state='Arkansas', start=0, end=14),
     ]
     self.assertEqual(actual, expected)