def test_parse_18(self):
    """It handles a run-on with the label.

    Two names joined by 'and' and followed by a number should give one
    trait per name, with the number attached to the first trait only.
    """
    actual = COLLECTOR.parse("""Sarah Nunn and Laura Eason 9834""")
    expected = [
        Trait(col_name='Sarah Nunn', col_no='9834', start=0, end=31),
        Trait(col_name='Laura Eason', start=0, end=31),
    ]
    self.assertEqual(actual, expected)
def test_parse_15(self):
    """It parses collectors separated by 'with'.

    The trailing date text after the collector number must not be part
    of either trait.
    """
    label = 'Sarah Nunn with Angela Brown 7529 20 October 2002 of'
    actual = COLLECTOR.parse(label)
    expected = [
        Trait(col_name='Sarah Nunn', col_no='7529', start=0, end=33),
        Trait(col_name='Angela Brown', start=0, end=33),
    ]
    self.assertEqual(actual, expected)
def test_parse_02(self):
    """It parses several collectors.

    Three names chained with 'and' give three traits; the collector
    number is expected on the first trait only.
    """
    label = 'Sarah Nunn and S. Jacobs and R. Mc Elderry 9480'
    actual = COLLECTOR.parse(label)
    expected = [
        Trait(col_name='Sarah Nunn', col_no='9480', start=0, end=47),
        Trait(col_name='S. Jacobs', start=0, end=47),
        Trait(col_name='R. Mc Elderry', start=0, end=47),
    ]
    self.assertEqual(actual, expected)
def test_parse_11(self):
    """It gets a state notation separated from the county."""
    # NOTE(review): the literal is on a single line here; the expected
    # offsets may rely on the label's original line breaks -- confirm.
    label = textwrap.dedent(""" APPALACHIAN STATE UNIVERSITY HERBARIUM PLANTS OF NORTH CAROLINA STONE MOUNTAIN STATE PARK WILKES COUNTY """)
    actual = ADMIN_UNIT.parse(label)
    expected = [
        Trait(us_state='North Carolina', start=40, end=64),
        Trait(us_county='Wilkes', start=91, end=104),
    ]
    self.assertEqual(actual, expected)
def test_parse_16(self):
    """It parses collectors separated by an ampersand.

    The label also carries 'Collector:', 'No.:' and 'Date:' field
    prefixes around the names and the collector number.
    """
    # NOTE(review): the literal is on a single line here; the expected
    # offsets may rely on the label's original line breaks -- confirm.
    label = textwrap.dedent(""" Collector: Christopher Reid & Sarah Nunn No.: 2018 Date: 16 May 2001 """)
    actual = COLLECTOR.parse(label)
    expected = [
        Trait(col_name='Christopher Reid', col_no='2018', start=1, end=51),
        Trait(col_name='Sarah Nunn', start=1, end=51),
    ]
    self.assertEqual(actual, expected)
def test_parse_11(self):
    """It finds a month-name date surrounded by other label text."""
    # NOTE(review): the literal is on a single line here; the expected
    # offsets may rely on the label's original line breaks -- confirm.
    label = textwrap.dedent(""" Slender erect shrubs 4-5 m.tall. June 7, 1923 xo, 23163""")
    actual = LABEL_DATE.parse(label)
    expected = [Trait(value='1923-06-07', start=34, end=46)]
    self.assertEqual(actual, expected)
def convert(token):
    """Build collector traits from a matched token.

    The ``col_name`` group is split into individual names on the words
    "and"/"with" and on commas or ampersands. Each name has any run of
    three or more dots (and everything after it) removed, and names
    shorter than ``MIN_LEN`` are dropped. When the piece following a
    name is a known name suffix it is appended to that name, and a piece
    that is itself a suffix never becomes a collector on its own.

    Args:
        token: the matched token; its ``group`` mapping is read for
            ``col_name`` and ``collector_no`` and its ``start``/``end``
            are copied onto every trait.

    Returns:
        ``None`` when there is no usable name or when the collector
        number ends in "m"/"M"; otherwise the result of ``squash`` on
        the list of traits, with the collector number (if any) stored on
        the first trait.
    """
    raw_names = token.group.get('col_name')
    if not raw_names:
        # No name text at all: nothing to build (and regex.split would
        # raise a TypeError on None).
        return None

    # The word boundaries keep "and"/"with" from splitting names that
    # merely contain those letters (e.g. "Holland", "Sanders").
    names = regex.split(r'\s*(?:\b(?:and|with)\b|[,&])\s*', raw_names)

    traits = []

    # Pair each piece with the piece after it ('' after the last one) so
    # that a trailing suffix can be folded into the preceding name.
    for name, suffix in zip_longest(names, names[1:], fillvalue=''):
        name = regex.sub(r'\.{3,}.*', '', name)
        if len(name) < MIN_LEN:
            continue

        trait = Trait(start=token.start, end=token.end)
        trait.col_name = name

        if suffix.lower() in name_parts.SUFFIXES:
            trait.col_name = f'{name} {suffix}'

        # A bare suffix was already attached to the previous name.
        if name.lower() not in name_parts.SUFFIXES:
            traits.append(trait)

    if not traits:
        return None

    col_no = token.group.get('collector_no')
    if col_no:
        # Temp hack: reject numbers ending in "m"/"M".
        # NOTE(review): presumably these are measurements in meters
        # rather than collector numbers -- confirm.
        if col_no[-1] in ('m', 'M'):
            return None
        traits[0].col_no = col_no

    return squash(traits)
def test_parse_15(self):
    """It handles an end-of-line between the state label and the state."""
    # NOTE(review): the literal is on a single line here; the expected
    # offsets may rely on the label's original line breaks -- confirm.
    label = textwrap.dedent(""" PLANTS OF North Carolina """)
    actual = ADMIN_UNIT.parse(label)
    expected = [Trait(us_state='North Carolina', start=1, end=25)]
    self.assertEqual(actual, expected)
def test_parse_03(self):
    """It gets the full notation.

    The whole input string is expected back as a single taxon value.
    """
    label = """Cephalanthus occidentalis L. Rubiaceas"""
    actual = PLANT_TAXON.parse(label)
    expected = [
        Trait(value='Cephalanthus occidentalis L. Rubiaceas',
              start=0, end=38),
    ]
    self.assertEqual(actual, expected)
def test_parse_04(self):
    """It handles a bad name.

    Only 'Hutchins' is expected as the collector name from the noisy
    'Collected by' text.
    """
    # NOTE(review): the literal is on a single line here; the expected
    # offsets may rely on the label's original line breaks -- confirm.
    label = textwrap.dedent(""" APPALACHIAN STATE UNIVERSITY HERBARIUM PLANTS OF NORTH CAROLINA Collected by _Wayne.. Hutchins. """)
    actual = COLLECTOR.parse(label)
    expected = [Trait(col_name='Hutchins', start=65, end=95)]
    self.assertEqual(actual, expected)
def test_parse_07(self):
    """It works with noisy text.

    Only the county is expected from the surrounding taxon and locality
    text.
    """
    # NOTE(review): the literal is on a single line here; the expected
    # offsets may rely on the label's original line breaks -- confirm.
    label = textwrap.dedent(""" Cornus drummondii C. A. Mey. Hempstead County Grandview Prairie; on CR 35, 10 air miles S/SE of Nashville; in """)
    actual = ADMIN_UNIT.parse(label)
    expected = [Trait(us_county='Hempstead', start=30, end=46)]
    self.assertEqual(actual, expected)
def test_parse_10(self):
    """It handles no spaces between the date parts.

    A slash-separated date with a two-digit year is expected to come
    back as an ISO date string.
    """
    # NOTE(review): the literal is on a single line here; the expected
    # offsets may rely on the label's original line breaks -- confirm.
    label = textwrap.dedent(""" Altitude 1000 ft. Date 8/20/75 Collected by ...Wayne. Hutchins No. """)
    actual = LABEL_DATE.parse(label)
    expected = [Trait(value='1975-08-20', start=19, end=31)]
    self.assertEqual(actual, expected)
def test_parse_13(self):
    """It parses multi-part collector numbers.

    The number here has a letter part and a digit part ('SWBII 1'); the
    closing parenthesis is not expected in the trait.
    """
    label = 'Coll. Stephen W. Bailey No, SWBII 1)'
    actual = COLLECTOR.parse(label)
    expected = [
        Trait(col_name='Stephen W. Bailey', col_no='SWBII 1',
              start=0, end=35),
    ]
    self.assertEqual(actual, expected)
def test_parse_10(self):
    """It handles more random words matching names.

    The elevation text before the name must not produce a collector;
    only 'Andrew Jenkins' with number '427' is expected.
    """
    # NOTE(review): the literal is on a single line here; the expected
    # offsets may rely on the label's original line breaks -- confirm.
    label = textwrap.dedent(""" WGS84. Flevation: 1,524 m (5000 ft) - 1530 m (5020 ft) Andrew Jenkins 427 93/2009 """)
    actual = COLLECTOR.parse(label)
    expected = [
        Trait(col_name='Andrew Jenkins', col_no='427', start=56, end=74),
    ]
    self.assertEqual(actual, expected)
def test_parse_01(self):
    """It gets a taxon notation.

    The family name before the taxon and the county after it are not
    expected in the value.
    """
    # NOTE(review): the literal is on a single line here; the expected
    # offsets may rely on the label's original line breaks -- confirm.
    label = textwrap.dedent(""" Cornaceae Cornus obliqua Raf. Washington County""")
    actual = PLANT_TAXON.parse(label)
    expected = [Trait(value='Cornus obliqua Raf.', start=11, end=30)]
    self.assertEqual(actual, expected)
def test_parse_03(self):
    """It adjusts future dates back a century.

    Tomorrow's date, written with a two-digit year, is fed to the
    parser; the expected value is that date one hundred years earlier,
    flagged with ``century_adjust``.
    """
    # Input: tomorrow formatted as day, abbreviated month, 2-digit year.
    label = (date.today() + relativedelta(days=1)).strftime('%d %b %y')

    # Expected: the same day a century earlier, as an ISO date string.
    shifted = date.today() + relativedelta(years=-100, days=1)
    iso_value = shifted.strftime('%Y-%m-%d')

    actual = LABEL_DATE.parse(label)
    expected = [
        Trait(value=iso_value, century_adjust=True, start=0, end=9),
    ]
    self.assertEqual(actual, expected)
def test_parse_02(self):
    """It gets a family notation.

    Only the family name is expected from the surrounding locality and
    taxon text.
    """
    # NOTE(review): the literal is on a single line here; the expected
    # offsets may rely on the label's original line breaks -- confirm.
    label = textwrap.dedent(""" Crowley's Ridge Fabaceae Vicia villosa Roth ssp. varia (Host) Corb. CLAY COUNTY """)
    actual = PLANT_FAMILY.parse(label)
    expected = [Trait(value='Fabaceae', start=17, end=25)]
    self.assertEqual(actual, expected)
def test_parse_05(self):
    """It handles initials differently.

    The stray single letters before 'Coll.' are not expected in the
    collector name.
    """
    # NOTE(review): the literal is on a single line here; the expected
    # offsets may rely on the label's original line breaks -- confirm.
    label = textwrap.dedent(""" x e Coll. Marie P. Locke No. 5595 """)
    actual = COLLECTOR.parse(label)
    expected = [
        Trait(col_name='Marie P. Locke', col_no='5595', start=5, end=34),
    ]
    self.assertEqual(actual, expected)
def test_parse_14(self):
    """It handles newlines between collector and collector number."""
    # NOTE(review): the literal is on a single line here; the expected
    # offsets may rely on the label's original line breaks -- confirm.
    label = textwrap.dedent(""" Coll. Marie P. Locke No. 2319 Date September 4, 1977 """)
    actual = COLLECTOR.parse(label)
    expected = [
        Trait(col_name='Marie P. Locke', col_no='2319', start=1, end=31),
    ]
    self.assertEqual(actual, expected)
def convert(token):
    """Build an administrative-unit trait from a matched token.

    Args:
        token: the matched token; its ``start``/``end`` are copied onto
            the trait and its ``group`` mapping is read for the optional
            ``us_county`` and ``us_state`` entries.

    Returns:
        A ``Trait`` whose ``us_county`` (title-cased) and ``us_state``
        (passed through ``us_states.normalize_state``) are set only when
        the matching group is present and non-empty.
    """
    trait = Trait(start=token.start, end=token.end)

    county = token.group.get('us_county')
    if county:
        trait.us_county = county.title()

    state = token.group.get('us_state')
    if state:
        trait.us_state = us_states.normalize_state(state)

    return trait
def test_parse_06(self):
    """It handles random words matching names.

    Only 'Hutchins' from the 'Collected by' text is expected; nothing in
    the preceding taxon and locality text should yield a collector.
    """
    # NOTE(review): the literal is on a single line here; the expected
    # offsets may rely on the label's original line breaks -- confirm.
    label = textwrap.dedent(""" Woodsia obtusa (Sprengel) Torrey Dry hardwood slope 3 miles south of Isothermal Community College. Altitude 960 ft. Date 6/9/75 Collected by _Wayne.. Hutchins. """)
    actual = COLLECTOR.parse(label)
    expected = [Trait(col_name='Hutchins', start=129, end=159)]
    self.assertEqual(actual, expected)
def test_parse_08(self):
    """It handles more random words matching names.

    A single 'Hugo Churchill' trait without a collector number is
    expected from this long label.
    """
    # NOTE(review): the literal is on a single line here; the expected
    # offsets may rely on the label's original line breaks -- confirm.
    label = textwrap.dedent(""" PRINGLE HERBARIUM DEPT. OF BOTANY @ UNIVERSITY OF VERMONT Campyloneuron repens (Auble.) C.Pres] Costa Rica, Cartago Prov. Valle de La Estrella. Road between Estrella and Vara de Roble. 1600m. Steep canyon with stream on east flowing into Rio Empalme, ex Herb Hugo Churchill 3466 Hugo Churchill Feb, 5, 1980 """)
    actual = COLLECTOR.parse(label)
    expected = [Trait(col_name='Hugo Churchill', start=280, end=294)]
    self.assertEqual(actual, expected)
def test_parse_03(self):
    """It does not parse other fields.

    The determiner ('Det, Edwin B. Smith') must not be reported; only
    the 'Coll.' name and its number are expected.
    """
    # NOTE(review): the literal is on a single line here; the expected
    # offsets may rely on the label's original line breaks -- confirm.
    label = textwrap.dedent(""" Rhus glabra L. "Smooth Sumac" Woodruff Co., Arkansas Vicinity of bridge on Hwy 33, ca. 2 mi. S. of the town of Gregory; S19, T6N; R3W. Det, Edwin B. Smith Coll. Marie P. Locke No. 5595 Date June 29, 1985 """)
    actual = COLLECTOR.parse(label)
    expected = [
        Trait(col_name='Marie P. Locke', col_no='5595',
              start=156, end=185),
    ]
    self.assertEqual(actual, expected)
def convert(token):
    """Normalize a parsed date.

    Every run of characters in the ``value`` group that is not a letter
    or digit is replaced with a single hyphen, and the result is handed
    to ``parser.parse``. A date later than today is moved back one
    hundred years and the trait is flagged with ``century_adjust``.

    Args:
        token: the matched token; ``group['value']`` holds the date text
            and ``start``/``end`` are copied onto the trait.

    Returns:
        A ``Trait`` whose ``value`` is the first ten characters of the
        date's ISO string, or ``None`` when the cleaned text is shorter
        than four characters or cannot be parsed.
    """
    trait = Trait(start=token.start, end=token.end)

    cleaned = regex.sub(
        r'[^a-z\d]+', '-', token.group['value'],
        flags=regex.I | regex.X)

    # Too short to be a usable date.
    if len(cleaned) < 4:
        return None

    try:
        when = parser.parse(cleaned).date()
    except (parser.ParserError, IllegalMonthError):
        return None

    # A date in the future is pushed back a century.
    in_future = when > date.today()
    if in_future:
        when = when - relativedelta(years=100)

    trait.value = when.isoformat()[:10]
    if in_future:
        trait.century_adjust = True

    return trait
def test_parse_13(self):
    """It finds the correct parse.

    A 'county, state' pair is expected as one trait holding both the
    county and the state.
    """
    label = """Cape May, New Jersey"""
    actual = ADMIN_UNIT.parse(label)
    expected = [
        Trait(us_state='New Jersey', us_county='Cape May',
              start=0, end=20),
    ]
    self.assertEqual(actual, expected)
def test_parse_12(self):
    """It parses multiword counties and states.

    Both the county ('Cape May') and the state ('New Jersey') consist of
    two words and are expected on a single trait.
    """
    label = """Cape May, New Jersey"""
    actual = ADMIN_UNIT.parse(label)
    expected = [
        Trait(us_state='New Jersey', us_county='Cape May',
              start=0, end=20),
    ]
    self.assertEqual(actual, expected)
def test_parse_10(self):
    """It gets a multi-word state notation.

    The upper-case state after 'PLANTS OF' is expected back in title
    case.
    """
    label = """PLANTS OF NORTH CAROLINA"""
    actual = ADMIN_UNIT.parse(label)
    expected = [Trait(us_state='North Carolina', start=0, end=24)]
    self.assertEqual(actual, expected)
def test_parse_09(self):
    """It gets a state notation.

    The upper-case state after 'PLANTS OF' is expected back in title
    case.
    """
    label = """PLANTS OF ARKANSAS"""
    actual = ADMIN_UNIT.parse(label)
    expected = [Trait(us_state='Arkansas', start=0, end=18)]
    self.assertEqual(actual, expected)
def test_parse_08(self):
    """It picks up common OCR errors.

    'Councy' (a misread 'County') should still mark 'Caldwell' as a
    county.
    """
    label = """Caldwell Councy"""
    actual = ADMIN_UNIT.parse(label)
    expected = [Trait(us_county='Caldwell', start=0, end=15)]
    self.assertEqual(actual, expected)
def test_parse_06(self):
    """It normalizes state abbreviations.

    'Ark.' is expected to come back as the full state name 'Arkansas'.
    """
    label = 'Desha Co., Ark.'
    actual = ADMIN_UNIT.parse(label)
    expected = [
        Trait(us_county='Desha', us_state='Arkansas', start=0, end=14),
    ]
    self.assertEqual(actual, expected)