Code Example #1
"""Parse ovaries size notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.reproductive import convert, double

VOCAB = Vocabulary(patterns.VOCAB)

OVARY_SIZE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # A key with units, like: gonadLengthInMM
        VOCAB.term(
            "key_with_units",
            r"""
                (?P<ambiguous_key> gonad ) \s*
                    (?P<dim> length | len | width ) \s* in \s*
                    (?P<len_units> millimeters | mm )
            """,
        ),
        VOCAB.grouper("value", " cross | number len_units? "),
        # E.g.: active, Or: immature
        VOCAB.grouper("state",
                      "active mature destroyed visible developed".split()),
        # Male or female ambiguous, like: gonadLength1
        VOCAB.grouper(
            "ambiguous",
            """
Code Example #2
File: sex.py  Project: rafelafrance/traiter_vertnet
"""Parse sex notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

SEX = Base(
    name=__name__.split(".")[-1],
    rules=[
        # JSON keys for sex
        VOCAB.term("sex_key", "sex"),
        # The sexes
        VOCAB.term("sex_vocab", "females? males?".split()),
        # These are words that indicate that "sex" is not a key
        VOCAB.term("not_sex", "and is was".split()),
        # Allow arbitrary words in some cases
        VOCAB.part("word", r' \b [a-z] [^;,"=:\s]* '),
        # Some patterns need a terminator
        VOCAB.part("separator", ' [;,"] | $ '),
        # E.g.: sex might be female;
        VOCAB.producer(
            convert,
            """ sex_key (?P<value> ( sex_vocab | word ){1,2} quest? ) separator """,
        ),
        # E.g.: sex=female?, Or: sex=unknown
        VOCAB.producer(convert,
                       " sex_key (?P<value> ( sex_vocab | word ) quest? ) "),
        # E.g.: male, Or: male?
Code Example #3
"""Parse nipple state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Convert parsed token into a trait."""
    trait = Trait(
        value="enlarged" if token.group.get("pos") else "not enlarged",
        start=token.start,
        end=token.end,
    )
    return trait


NIPPLES_ENLARGED = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["conj"],
        VOCAB.part("separator", r' [;"?/,] '),
        VOCAB.term("enlarged_abbrev", r"[oc]e[ln]"),
        VOCAB.term("not_enlarged_abbrev", r"[oc]s[ln]"),
        VOCAB.term("false", """ false """),

        VOCAB.producer(convert, """ (?P<pos> nipple enlarged ) """),
Code Example #4
"""Parse testes state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Convert parsed token into a trait producer."""
    trait = Trait(value=token.group["value"].lower(),
                  start=token.start,
                  end=token.end)
    trait.is_flag_in_token(token, "ambiguous_key")
    return trait


SCROTAL_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()),
        VOCAB.term("scrotal_abbrev", "ns sc".split()),
        # If possible exclude length. Ex: reproductive data=testes: 11x7 mm
        VOCAB.grouper("length", "cross len_units?"),
        VOCAB.producer(
            convert,
            """ (?P<value>
                ( testes | testes_abbrev ) non? ( scrotal | scrotal_abbrev ) )
Code Example #5
"""Parse v****a state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

VAGINA_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.part("v****a", r""" (?<! sal ) ( v****a | vag | vulva ) """),
        VOCAB.term("abbrev", r""" ov cv [oc][sme][ln] vc vo """.split()),
        VOCAB.part(
            "closed",
            r"""
                closed | imperforated | imperf | cerrada | non [-\s] perforated
                | unperforate | non  [-\s] perf | clsd | imp
            """,
        ),
        VOCAB.part("open", r""" open | perforated? | perf | abrir """),
        VOCAB.part("other", r""" swollen | plugged | plug | sealed """),
        VOCAB.grouper("state", """ closed | open | other """),
        VOCAB.producer(convert, """ (?P<value> vagina partially? state ) """),
        VOCAB.producer(convert, """ (?P<value> state vagina state? ) """),
        VOCAB.producer(convert,
                       """ (?P<value> ( state | abbrev )  vagina? ) """),
    ],
)
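
A minimal usage sketch for the parser above (not taken from the source repository): it assumes Base exposes a parse(text) method that returns a list of Trait objects, as the other vertnet parsers on this page imply.

# Hypothetical invocation; the input string and expected value are illustrative only.
traits = VAGINA_STATE.parse("vagina closed")
# Expected: one trait whose value covers the matched "vagina closed" span.
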
Code Example #6
"""Parse ovaries state notations."""

import regex
from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Convert parsed token into a trait."""
    value = token.group["value"].lower()
    if regex.match(r"^[\s\d]+$", value):
        return None
    trait = Trait(value=value, start=token.start, end=token.end)
    trait.is_flag_in_token(token, "ambiguous_key")
    trait.is_value_in_token(token, "side")
    return trait


def double(token):
    """Convert a single token into two traits."""
    trait1 = Trait(
        value=token.group["value"][0].lower(),
        side=token.group["side"][0].lower(),
        start=token.start,
        end=token.end,
    )
Code Example #7
"""Parse testes size notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.reproductive import convert, double

VOCAB = Vocabulary(patterns.VOCAB)

TESTES_SIZE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Note: abbrev differs from the one in the testes_state_trait
        VOCAB.term("abbrev", "tes ts tnd td tns ta".split()),
        # The abbreviation key, just: t. This can be a problem.
        VOCAB.part("char_key", r" \b t (?! [a-z] )"),
        # A key with units, like: gonadLengthInMM
        VOCAB.term(
            "key_with_units",
            r"""
                (?P<ambiguous_key> gonad ) \s*
                    (?P<dim> length | len | width ) \s* in \s*
                    (?P<len_units> millimeters | mm )
            """,
        ),
        VOCAB.grouper(
            "value",
            """ cross | number len_units? (?! mass_units ) """,
        ),
Code Example #8
from functools import partial

import regex
from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.numeric import (
    fraction,
    numeric_fix_ups,
    shorthand_length,
    simple_len,
)
from vertnet.pylib.util import FLAGS

VOCAB = Vocabulary(patterns.VOCAB)

# How far to look into the surrounding context to disambiguate the parse
LOOK_BACK_FAR = 40
LOOK_BACK_NEAR = 10

# These indicate that the parse is not really for an ear length
IS_ET = regex.compile(r" e \.? t ", FLAGS)
IS_NUMBER = regex.compile(" [#] ", FLAGS)
IS_MAG = regex.compile(" magnemite ", FLAGS)
IS_ID = regex.compile(" identifier | ident | id ", FLAGS)

# The 'E' abbreviation sometimes gets confused with the abbreviation for East.
# Try to disambiguate the two by looking for a North nearby.
LOOK_AROUND = 10
IS_EAST = regex.compile(r" \b n ", FLAGS)
Code Example #9
"""Parse date notations."""

from calendar import IllegalMonthError
from datetime import date

import regex
from dateutil import parser
from dateutil.relativedelta import relativedelta
from traiter.old.vocabulary import LOWEST, Vocabulary

from digi_leap.parsers.base import Base
from digi_leap.pylib import patterns
from digi_leap.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)

YEAR_LEN = 2


def convert(token):
    """Normalize a parsed date."""
    trait = Trait(start=token.start, end=token.end)

    value = regex.sub(r'[^a-z\d]+',
                      '-',
                      token.group['value'],
                      flags=regex.I | regex.X)

    if len(value) < 4:
        return None
Code Example #10
"""Patterns for US states."""

import pandas as pd
import regex
from traiter.old.vocabulary import Vocabulary

from digi_leap.pylib import const, patterns

STATE_CSV = const.DATA_DIR / 'US_states.csv'
STATES = {}
STATE_NAMES = []
NORMALIZE_US_STATE = {}

VOCAB = Vocabulary(patterns.VOCAB)

VOCAB.term(
    'USA', r"""
    U\.?S\.?A\.? | U\.?S\.?
    | United \s? States \s? of \s? America | United \s? States
    | U\.? \s? of \s? A\.?""")


def normalize_key(state: str) -> str:
    """Convert state abbreviations into a consistent key."""
    return regex.sub(r'[^a-z]+', '', state.lower())


def normalize_state(state: str) -> str:
    """Convert state abbreviations to the state name."""
    return NORMALIZE_US_STATE.get(normalize_key(state), state.title())
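
A quick illustration of the two helpers above (the inputs are hypothetical; NORMALIZE_US_STATE is populated elsewhere in the module):

normalize_key('N. Mex.')       # -> 'nmex'
normalize_state('N. Mex.')     # -> the full state name, if 'nmex' is a key in NORMALIZE_US_STATE
normalize_state('new jersey')  # -> 'New Jersey' (falls back to str.title() for unknown keys)
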
Code Example #11
"""Parse hind foot length notations."""

from functools import partial

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.numeric import fix_up_inches, fraction, shorthand_length, simple

VOCAB = Vocabulary(patterns.VOCAB)


def fix_up(trait, text):
    """Fix problematic parses."""
    # Try to disambiguate double quotes from inches
    return fix_up_inches(trait, text)


HIND_FOOT_LENGTH = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Units are in the key, like: HindFootLengthInMillimeters
        VOCAB.term(
            "key_with_units",
            r"""( hind \s* )? foot \s* ( length | len ) \s* in \s*
                    (?P<units> millimeters | mm )
            """,
        ),
        # Standard keywords that indicate a hind foot length follows
Code Example #12
"""Find collector notations on herbarium specimen labels."""

from itertools import zip_longest

import regex
from traiter.old.vocabulary import LOWEST, Vocabulary
from traiter.util import squash

from digi_leap.parsers import name_parts
from digi_leap.parsers.base import Base
from digi_leap.parsers.us_states import STATE_NAMES
from digi_leap.pylib.trait import Trait

VOCAB = Vocabulary(name_parts.VOCAB)

MIN_LEN = 5  # Minimum collector name length


def convert(token):
    """Build a collector trait"""
    names = regex.split(r'\s*(?:and|with|[,&])\s*',
                        token.group.get('col_name'))

    traits = []

    for name, suffix in zip_longest(names, names[1:], fillvalue=''):
        name = regex.sub(r'\.{3,}.*', '', name)
        if len(name) < MIN_LEN:
            continue

        trait = Trait(start=token.start, end=token.end)
Code Example #13
"""Parse total length notations."""

from functools import partial

import regex
from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.numeric as numeric
import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.numeric import compound, fix_up_inches, fraction
from vertnet.pylib.util import FLAGS

VOCAB = Vocabulary(patterns.VOCAB)

# How far to look into the surrounding context to disambiguate the parse
LOOK_BACK_FAR = 40
LOOK_BACK_NEAR = 10

# These indicate that the parse is not a total length
IS_ID = regex.compile(" identifier | ident | id | collector ", FLAGS)
IS_TRAP = regex.compile(" trap ", FLAGS)
IS_TESTES = regex.compile(
    " reproductive | gonad | test | scrotal | scrotum | scrot ", FLAGS)

# The 'L' abbreviation sometimes gets confused with the abbreviation for Left.
# Try to disambiguate the two by looking for a Right nearby.
LOOK_AROUND = 10
IS_LEFT = regex.compile(r" \b r \b ", FLAGS)

Code Example #14
"""Parse tragus length notations."""

from functools import partial

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.numeric import fix_up_inches, fraction, shorthand_length, simple

VOCAB = Vocabulary(patterns.VOCAB)


def fix_up(trait, text):
    """Fix problematic parses."""
    # Try to disambiguate double quotes from inches
    return fix_up_inches(trait, text)


TRAGUS_LENGTH = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Units are in the key, like: tragusLengthInMillimeters
        VOCAB.term(
            "key_with_units",
            r"""( tragus \s* ) \s* ( length | len ) \s* in \s*
                    (?P<units> millimeters | mm ) """,
        ),
        # Standard keywords that indicate a tragus length follows
        VOCAB.term(
Code Example #15
"""Parse body mass notations."""

from traiter.old.vocabulary import Vocabulary
from traiter.util import as_list, squash, to_positive_float

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.convert_units import convert_units
from vertnet.pylib.numeric import add_flags, as_value, simple_mass
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def shorthand(token):
    """Convert a shorthand value like 11-22-33-44:55g."""
    trait = Trait(start=token.start, end=token.end)
    flag = as_value(token, trait, "shorthand_wt", "shorthand_wt_units")
    trait.is_flag_in_token(token, "estimated_wt", rename="estimated_value")
    trait.is_shorthand = True
    return trait if flag else None


def compound(token):
    """Convert a compound weight like: 2 lbs. 3.1 - 4.5 oz."""
    trait = Trait(start=token.start, end=token.end)
    trait.units = [token.group["pounds"], token.group["ounces"]]
    trait.units_inferred = False
    trait.is_flag_missing(token, "key", rename="ambiguous_key")
    lbs = convert_units(to_positive_float(token.group["lbs"]), "lbs")
    ozs = [
Code Example #16
"""Parse lactation state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base, convert

VOCAB = Vocabulary(patterns.VOCAB)

LACTATION_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.part(
            "lactating",
            r""" (
                lactating | lactation | lactated | lactate | lact
                | lactaing | lactacting | lactataing | lactational
                | oelact | celact | lactati | lactacting | lactatin
                | lactatting | lactatng
                | nursing | suckling
                ) \b
            """,
        ),
        VOCAB.part("not", r" \b ( not | non | no ) "),
        VOCAB.part(
            "post",
            r""" \b (
                (( just | recently ) \s+ )? finished
                | post | recently | recent | had | pre
            ) """,
        ),
Code Example #17
"""Parse placental scar counts."""

from traiter.old.vocabulary import Vocabulary
from traiter.util import as_list

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait
from vertnet.pylib.util import to_positive_int

VOCAB = Vocabulary(patterns.VOCAB)

SUB = {"l": "left", "r": "right", "m": "male", "f": "female"}


def convert_count(token):
    """Convert parsed tokens into a result."""
    trait = Trait(start=token.start, end=token.end)

    trait.value = to_positive_int(token.group.get("value"))
    count1 = to_positive_int(token.group.get("count1"))
    count2 = to_positive_int(token.group.get("count2"))
    side1 = SUB.get(token.group.get("side1", " ").lower()[0], "side1")
    side2 = SUB.get(token.group.get("side2", " ").lower()[0], "side2")

    if not trait.value:
        trait.value = count1 + count2

    if count1 or side1 != "side1":
        setattr(trait, side1, count1)
Code Example #18
"""Find taxon notations on herbarium specimen labels."""

import pandas as pd
from traiter.old.vocabulary import LOWEST, Vocabulary

from digi_leap.parsers.base import Base
from digi_leap.pylib import const, patterns
from digi_leap.pylib.trait import Trait

PLANT_FAMILIES = const.DATA_DIR / 'itis_plant_families.csv'
PLANT_GENERA = const.DATA_DIR / 'itis_plant_genera.csv'

VOCAB = Vocabulary(patterns.VOCAB)
VOCAB.part('word', r' \S+ ', capture=False, priority=LOWEST)

DATA = pd.read_csv(PLANT_FAMILIES, na_filter=False, dtype=str)
VOCAB.term('plant_family', DATA['complete_name'].tolist())

DATA = pd.read_csv(PLANT_GENERA, na_filter=False, dtype=str)
VOCAB.term('plant_genus', DATA['complete_name'].tolist())


def convert(token):
    """Normalize a parsed taxon notation"""
    return Trait(start=token.start, end=token.end, value=token.group['value'])


PLANT_TAXON = Base(name='plant_taxon',
                   rules=[
                       VOCAB['eol'],
                       VOCAB.producer(convert,
Code Example #19
"""Shared token patterns."""

from traiter.old.vocabulary import FIRST, LOWEST, Vocabulary

VOCAB = Vocabulary()

# Chars that may be a token
VOCAB.part('slash', r' [/] ', capture=False)
VOCAB.part('dash', r' (?: – | - ) ', capture=False)
VOCAB.part('open', r' [(\[] ', capture=False)
VOCAB.part('close', r' [)\]] ', capture=False)
VOCAB.part('x', r' [x×] ', capture=False)
VOCAB.part('quest', r' [?] ')
VOCAB.part('comma', r' [,] ', capture=False, priority=LOWEST)
VOCAB.part('semicolon', r' [;] ', capture=False, priority=LOWEST)
VOCAB.part('ampersand', r' [&] ', capture=False)
VOCAB.part('eq', r' [=] ', capture=False)
VOCAB.part('under', r' [_] ', capture=False)
VOCAB.part('eol', r' [\n\r\f] ', capture=False)
VOCAB.part('dot', r' [.] ', capture=False)

# Small words
VOCAB.part('by', r' by ', capture=False)
VOCAB.part('to', r' to ', capture=False)
VOCAB.part('with', r' with ', capture=False)
VOCAB.part('up_to', r' ( up \s+ )? to ', capture=False)
VOCAB.term('and', r' and ', capture=False)
VOCAB.term('conj', ' or and '.split(), capture=False)
VOCAB.term('prep', ' to with on of '.split(), capture=False)

VOCAB.term('word', r' [a-z] \w* ', capture=False, priority=LOWEST)
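
Downstream parsers build on this shared vocabulary rather than redefining the basic tokens. A minimal sketch of that pattern, mirroring the other examples on this page (the importing module path and the new term are assumptions, not taken from the repository):

from traiter.old.vocabulary import Vocabulary
import digi_leap.pylib.patterns as patterns  # assumed to be the module shown above

VOCAB = Vocabulary(patterns.VOCAB)                      # inherit the shared tokens
VOCAB.term('sep_word', ' collected located '.split())   # hypothetical new term layered on top
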
Code Example #20
"""Parse embryo counts."""

from traiter.old.vocabulary import Vocabulary
from traiter.util import as_list

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait
from vertnet.pylib.util import to_positive_int

VOCAB = Vocabulary(patterns.VOCAB)

SUB = {"l": "left", "r": "right", "m": "male", "f": "female"}


def convert(token):
    """Convert parsed tokens into a result."""
    trait = Trait(start=token.start, end=token.end)

    if token.group.get("total"):
        trait.value = to_positive_int(token.group["total"])

    if token.group.get("subcount"):
        trait.value = sum(
            to_positive_int(c) for c in as_list(token.group["subcount"]))

    if token.group.get("subcount") and token.group.get("sub"):
        for count, sub in zip(as_list(token.group["subcount"]),
                              as_list(token.group.get("sub"))):
            count = "1" if count == "!" else count
            sub = SUB.get(sub[0].lower(), sub)
Code Example #21
"""Parse lactation state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Convert parsed token into a trait."""
    trait = Trait(
        value="lactating" if token.group.get("pos") else "not lactating",
        start=token.start,
        end=token.end,
    )
    return trait


LACTATION_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.part(
            "lactating",
            r""" (
                lactating | lactation | lactated | lactate | lact
                | lactaing | lactacting | lactataing | lactational
                | oelact | celact | lactati | lactacting | lactatin
                | lactatting | lactatng
Code Example #22
"""Patterns for names."""

import pandas as pd
from traiter.old.vocabulary import Vocabulary

from digi_leap.pylib import patterns
from digi_leap.pylib.const import DATA_DIR

NAME_CSV = DATA_DIR / 'name_parts.csv'

SUFFIXES = 'filho ii iii jr sr'.split()

VOCAB = Vocabulary(patterns.VOCAB)


def build_name_parts():
    """Build name patterns."""
    df = pd.read_csv(NAME_CSV, na_filter=False, dtype=str)
    VOCAB.term('name_part', df['name'].tolist(), capture=False)


build_name_parts()

VOCAB.term('suffix', SUFFIXES)
VOCAB.term('initial', r'[[:alpha:]] (?! \s* \d+ )')
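
A small check of the 'initial' pattern above, compiled directly with the regex module (the real Vocabulary applies its own compile flags; VERBOSE and IGNORECASE are assumed here):

import regex

INITIAL = regex.compile(r'[[:alpha:]] (?! \s* \d+ )', regex.VERBOSE | regex.IGNORECASE)
INITIAL.match('J. Smith')  # matches 'J': a bare personal initial
INITIAL.match('B 12')      # no match: the lookahead rejects a letter followed by digits
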
Code Example #23
"""Patterns for US counties."""

from collections import defaultdict

import pandas as pd
from traiter.old.vocabulary import Vocabulary

from digi_leap.parsers import us_states
from digi_leap.pylib import const

COUNTY_CSV = const.DATA_DIR / 'US_counties.csv'

VOCAB = Vocabulary(us_states.VOCAB)


def build_counties():
    """Read the CSV file and build counties by state."""
    counties = defaultdict(list)
    df = pd.read_csv(COUNTY_CSV, na_filter=False, dtype=str)
    for _, row in df.iterrows():
        counties[row['State']].append(row['County'])

    us_county = []
    for abbrev in us_states.STATES.values():
        names = []
        for name in [n for n in counties[abbrev] if n not in us_states.STATES]:
            name = name.replace('.', r'\.?')
            name = name.replace("'", "'?")
            name = name.replace(' ', r'\s?')
            name = name.replace('-', r'[\s-]?')
            names.append(name)
Code Example #24
"""Parse embryo lengths."""

from traiter.old.vocabulary import Vocabulary
from traiter.util import as_list, to_positive_float

import vertnet.pylib.convert_units as convert_units
import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.numeric import add_flags, fix_up_inches, simple
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)

TOO_BIG = 1000


def convert(token):
    """Convert parsed token into a trait product."""
    trait = simple(token, units="len_units")
    return trait if all(x < TOO_BIG for x in as_list(trait.value)) else None


def isolate(token):
    """Convert parsed token into a trait product."""
    token.group["number"] = [
        v.strip() for v in token.group["value"].split("x")
    ]
    return convert(token)


def convert_many(token):
Code Example #25
"""Shared reproductive trait tokens (testes & ovaries)."""

from traiter.old.vocabulary import LOWEST, Vocabulary

import vertnet.pylib.patterns as patterns

VOCAB = Vocabulary(patterns.VOCAB)

VOCAB.term("sex", "females? | males? | [f]")

VOCAB.term("active", "active inactive".split())
VOCAB.part("and", r" ( and \b | [&] ) ")
VOCAB.term("count", r"""( only | all | both )? \s* [12]""")

VOCAB.term(
    "color",
    r""" (( dark | light | pale ) \s* )?
         ( red | pink | brown | black | white | pigmented )
    """,
)

VOCAB.term("texture", " smooth ")

VOCAB.term("covered", " covered ")

VOCAB.term("destroyed", "destroy(ed)?")

VOCAB.part(
    "size",
    r"""
        ( very \s+ )?
Code Example #26
"""Parse forearm length notations."""

from functools import partial

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.numeric import fix_up_inches, fraction, shorthand_length, simple

VOCAB = Vocabulary(patterns.VOCAB)


def fix_up(trait, text):
    """Fix problematic parses."""
    # Try to disambiguate double quotes from inches
    return fix_up_inches(trait, text)


FOREARM_LENGTH = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB["uuid"],  # UUIDs cause problems with numbers
        # Units are in the key, like: ForearmLengthInMillimeters
        VOCAB.term(
            "key_with_units",
            r"""( forearm \s* )? \s* ( length | len ) \s* in \s*
                    (?P<units> millimeters | mm )
            """,
        ),
        # Standard keywords that indicate a forearm length follows
Code Example #27
"""Shared token patterns."""

from traiter.old.vocabulary import FIRST, LOWEST, Vocabulary

from vertnet.pylib.util import NUM_WORDS, ORDINALS

VOCAB = Vocabulary()

# Chars that may be a token
VOCAB.part("slash", r" [/] ", capture=False)
VOCAB.part("dash", r" \p{Pd} ", capture=False)
VOCAB.part("open", r" \p{Ps} ", capture=False)
VOCAB.part("close", r" \p{Pe} ", capture=False)
VOCAB.part("x", r" [x×] ", capture=False)
VOCAB.part("quest", r" [?] ")
VOCAB.part("comma", r" [,] ", capture=False, priority=LOWEST)
VOCAB.part("semicolon", r" [;] ", capture=False, priority=LOWEST)
VOCAB.part("colon", r" [:] ", capture=False, priority=LOWEST)
VOCAB.part("ampersand", r" [&] ", capture=False)
VOCAB.part("eq", r" [=] ", capture=False)
VOCAB.part("plus", r" [+] ", capture=False)
VOCAB.part("under", r" [_] ", capture=False)
VOCAB.part("eol", r" [\n\r\f] ", capture=False)
VOCAB.part("dot", r" [.] ", capture=False)

# Small words
VOCAB.part("by", r" by ", capture=False)
VOCAB.part("to", r" to ", capture=False)
VOCAB.part("with", r" with ", capture=False)
VOCAB.part("up_to", r" ( up \s+ )? to ", capture=False)
VOCAB.term("and", r" and ", capture=False)
Code Example #28
"""Parse lactation state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait
from vertnet.pylib.util import to_positive_int

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Convert single value tokens into a result."""
    value = token.group.get("value")

    if not value:
        return None

    trait = Trait(start=token.start, end=token.end)
    trait.value = to_positive_int(value)

    if trait.value > 100:
        return None

    if token.group.get("notation"):
        trait.notation = token.group["notation"]

    return trait
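
A hedged illustration of convert() above, using a stand-in for the token object (the real Token class comes from traiter and is not shown in this excerpt; the field values are invented):

from types import SimpleNamespace

fake = SimpleNamespace(start=0, end=5, group={"value": "3", "notation": "1L 2R"})
trait = convert(fake)
# trait.value == 3 and trait.notation == "1L 2R"; values above 100 make convert return None
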

Code Example #29
"""Parse tail length notations."""

from functools import partial

import regex
from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.numeric import fix_up_inches, fraction, shorthand_length, simple
from vertnet.pylib.util import FLAGS

VOCAB = Vocabulary(patterns.VOCAB)

# How far to look into the surrounding context to disambiguate the parse
LOOK_BACK_FAR = 40
LOOK_BACK_NEAR = 20

# These indicate that the parse is not really for a tail length
IS_TESTES = regex.compile(
    " reproductive | gonad | test | scrotal | scrotum | scrot ", FLAGS)
IS_ELEVATION = regex.compile(" elevation | elev ", FLAGS)
IS_TOTAL = regex.compile(" body | nose | snout ", FLAGS)
IS_TAG = regex.compile(" tag ", FLAGS)
IS_ID = regex.compile(" identifier | ident | id ", FLAGS)


def fix_up(trait, text):
    """Fix problematic parses."""
    # Check that this isn't a total length trait
    start = max(0, trait.start - LOOK_BACK_NEAR)
Code Example #30
"""Parse testes state notations."""

from traiter.old.vocabulary import Vocabulary

import vertnet.pylib.shared_reproductive_patterns as patterns
from vertnet.parsers.base import Base
from vertnet.pylib.trait import Trait

VOCAB = Vocabulary(patterns.VOCAB)


def convert(token):
    """Convert parsed token into a trait producer."""
    trait = Trait(
        value="scrotal" if token.group.get("pos") else "not scrotal",
        start=token.start,
        end=token.end,
    )
    return trait


SCROTAL_STATE = Base(
    name=__name__.split(".")[-1],
    rules=[
        VOCAB.term("testes_abbrev", "tes ts tnd td tns ta t".split()),
        VOCAB.term("scrotal_abbrev_pos", "sc".split()),
        VOCAB.term("scrotal_abbrev_neg", "ns ".split()),

        # If possible exclude length. Ex: reproductive data=testes: 11x7 mm
        VOCAB.grouper("length", "cross len_units?"),
        VOCAB.producer(convert, """ (?P<pos> scrotal_pos ) """),