def get_definitions(text,
                    return_sources=False,
                    decode_unicode=True) -> Generator:
    """
    Find possible definitions in natural language.
    :param decode_unicode:
    :param return_sources:
    :param text:
    :return:
    """

    for sentence in get_sentence_list(text):
        result = set()

        if decode_unicode:
            sentence = unidecode.unidecode(sentence)

        for item in TRIGGER_WORDS_PTN_RE.findall(sentence):
            result.update(EXTRACT_PTN_RE.findall(item))

        # case #2
        result.update(PAREN_PTN_RE.findall(sentence))

        for term in result:
            if len(get_token_list(term)) <= MAX_TERM_TOKENS:
                if return_sources:
                    yield (term, sentence)
                else:
                    yield term
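A minimal usage sketch (the sample string is illustrative; the module-level patterns such as TRIGGER_WORDS_PTN_RE and the helpers used above are assumed to be in scope):

sample = 'The term "Confidential Information" means any non-public information disclosed by either party.'

# Terms only
for term in get_definitions(sample):
    print(term)

# Each term together with the sentence it was found in
for term, source in get_definitions(sample, return_sources=True):
    print(term, "<-", source)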
Example #2
def get_copyright(text, return_sources=False) -> Generator:
    """
    Find copyright in text.
    :param text:
    :param return_sources:
    :return:
    """
    # Iterate through sentences
    if COPYRIGHT_PTN_RE.search(text):
        for sentence in get_sentence_list(text):
            for phrase in np_extractor.get_np(sentence):
                cps = COPYRIGHT_PTN_RE.findall(phrase)
                for cp_text, cp_sign, cp_date, cp_name in cps:
                    # TODO: catch in the general regex
                    if not cp_date:
                        cp_date_at_end = YEAR_PTN_RE.search(cp_name)
                        if cp_date_at_end:
                            cp_date = cp_date_at_end.group()
                            cp_name = re.sub(r'{}$'.format(cp_date), '', cp_name)
                    ret = (cp_sign.strip(),
                           cp_date.replace(' ', ''),
                           cp_name.strip(string.punctuation + string.whitespace))
                    if return_sources:
                        ret += (cp_text.strip(),)
                    yield ret
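A minimal usage sketch (the notice text is illustrative; COPYRIGHT_PTN_RE and the other module-level helpers are assumed to be in scope):

notice = "Copyright (C) 2019 Acme Corporation. All rights reserved."

# Each match is a (sign, date, name) tuple
for sign, date, name in get_copyright(notice):
    print(sign, date, name)

# With return_sources=True the matched text is appended to each tuple
for sign, date, name, source in get_copyright(notice, return_sources=True):
    print(source)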
Example #3
def get_constraints(text, strict=False) -> Generator:
    """
    Find possible constraints in natural language.
    :param text:
    :param strict:
    :return:
    """

    # Iterate through all potential matches
    for sentence in get_sentence_list(text):
        for match in RE_CONSTRAINT.finditer(sentence.lower()):
            # Get individual group matches
            captures = match.capturesdict()
            num_pre = len(captures["pre"])
            num_post = len(captures["post"])

            # Skip if strict and empty pre/post
            if strict and (num_pre + num_post == 0):
                continue

            # Setup fields
            constraint = captures.get("constraint").pop().lower()
            pre = "".join(captures["pre"])
            post = "".join(captures["post"])

            if num_post == 0 and num_pre == 1:
                combined = "{0} {1}".format(pre, constraint).lower().strip()
                if combined in CONSTRAINT_PHRASES:
                    constraint = combined

            # Append
            yield (constraint, pre, post)
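A minimal usage sketch (the sample sentence is illustrative; RE_CONSTRAINT and CONSTRAINT_PHRASES are assumed to be defined in the same module):

text = "The annual fee shall be no less than $10,000."

# Each match is a (constraint, pre, post) tuple of lower-cased text
for constraint, pre, post in get_constraints(text):
    print(constraint, "| pre:", pre, "| post:", post)

# strict=True drops matches with neither pre- nor post-context
strict_only = list(get_constraints(text, strict=True))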
Example #4
    def process(input_directory, input_file, section_name):
        #Speedup: bash runStanfordParserServer.sh, bash runSSTServer.sh

        import sys
        import pip
        import spacy
        import neuralcoref
        import lexnlp.nlp.en.segments.sentences as lex_sentences
        import question_generator as gen
        import csv
        import time

        #Load
        start_time = time.time()
        with open(input_file, 'r') as file:
            brief = file.read()
        print("--- %s seconds to Load ---" % (time.time() - start_time))

        #Preprocess
        ##start_time = time.time()
        ##brief = lex_sentences.pre_process_document(brief)
        ##print("--- %s seconds to LexNLP Preprocess---" % (time.time() - start_time))

        start_time = time.time()
        pronouns = spacy.load('en_core_web_sm')
        neuralcoref.add_to_pipe(pronouns,
                                greedyness=0.5,
                                max_dist=100,
                                blacklist=False)
        neural = pronouns(brief)
        brief = neural._.coref_resolved
        print("--- %s seconds to Pronoun Fix ---" % (time.time() - start_time))

        #Tokenize
        start_time = time.time()
        sentences = list(lex_sentences.get_sentence_list(brief))
        questions = gen.QuestionGenerator()
        print("--- %s seconds to Tokenize ---" % (time.time() - start_time))
def get_condition_annotations(text: str, strict=True) \
        -> Generator[ConditionAnnotation, None, None]:
    """
    Find possible conditions in natural language.
    :param text:
    :param strict:
    :return:
    """

    # Iterate through all potential matches
    for sentence in get_sentence_list(text):
        for match in RE_CONDITION.finditer(sentence):
            # Get individual group matches
            captures = match.capturesdict()
            num_pre = len(captures["pre"])
            num_post = len(captures["post"])

            # Skip if strict and empty pre/post
            if strict and (num_pre == 0 or num_post == 0):
                continue

            ant = ConditionAnnotation(
                coords=match.span(),
                condition=captures["condition"].pop().lower(),
                pre=captures["pre"].pop(),
                post=captures["post"].pop())
            yield ant
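A minimal usage sketch (the sample sentence is illustrative; RE_CONDITION and ConditionAnnotation are assumed to be importable from the same package, with the fields used below exposed as attributes):

text = "This agreement terminates if the licensee fails to pay within thirty days."

for ant in get_condition_annotations(text):
    # coords is the (start, end) span of the match within its sentence
    print(ant.coords, ant.condition, ant.pre, ant.post)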
Example #6
def get_noun_phrases(text,
                     strict=False,
                     return_source=False,
                     window=3,
                     valid_punctuation=None) -> Generator:
    """
    Get NNP phrases from text
    """
    valid_punctuation = valid_punctuation or VALID_PUNCTUATION
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        nnps = []
        last_nnp_pos = None
        for i, chunk in enumerate(sentence_pos):
            do_join = not strict and last_nnp_pos is not None and (
                i - last_nnp_pos) < window
            # Check label
            if chunk[1] in ['NNP', 'NNPS']:
                if do_join:
                    sep = "" if "(" in valid_punctuation and nnps[-1][
                        -1] == "(" else " "
                    nnps[-1] += sep + chunk[0]
                else:
                    nnps.append(chunk[0])
                last_nnp_pos = i
            elif do_join:
                if chunk[1] in ['CC'] or chunk[0] in valid_punctuation:
                    if chunk[0].lower() in ["or"]:
                        continue
                    nnps[-1] += (' ' if chunk[0].lower() in ['&', 'and', '(']
                                 else '') + chunk[0]
                    last_nnp_pos = i
                else:
                    last_nnp_pos = None

        # Clean up names and yield
        for nnp in nnps:
            # Cleanup
            nnp = nnp.strip()
            if len(nnp) <= 2:
                continue

            if nnp.lower().endswith(' and'):
                nnp = nnp[0:-4].strip()
            elif nnp.endswith(' &'):
                nnp = nnp[0:-2].strip()

            nnp = strip_unicode_punctuation(nnp).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield nnp, sentence
            else:
                yield nnp
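A minimal usage sketch (the sample sentence is illustrative; nltk, its POS tagger models, and the module-level VALID_PUNCTUATION are assumed to be available):

text = "General Electric and Dow Chemical announced a joint venture."

print(list(get_noun_phrases(text)))                       # joined NNP runs
print(list(get_noun_phrases(text, strict=True)))          # no joining across the window
print(list(get_noun_phrases(text, return_source=True)))   # (phrase, sentence) pairs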
Example #7
def get_geopolitical(text,
                     strict=False,
                     return_source=False,
                     window=2) -> Generator:
    """
    Get GPEs from text
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        gpes = []
        last_gpe_pos = None
        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            if isinstance(chunk, nltk.tree.Tree):
                # Check label
                if chunk.label() == 'GPE':
                    if not strict and last_gpe_pos is not None and (
                            i - last_gpe_pos) < window:
                        gpes[-1] += " " + " ".join([c[0] for c in chunk])
                    else:
                        gpes.append(" ".join([c[0] for c in chunk]))
                    last_gpe_pos = i
            elif not strict and last_gpe_pos is not None and (
                    i - last_gpe_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    gpes[-1] += " " + chunk[0]
                    last_gpe_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    if chunk[0].lower() in ["or"]:
                        continue
                    gpes[-1] += (" " if chunk[0].lower() in ["&", "and"] else
                                 "") + chunk[0]
                    last_gpe_pos = i
                else:
                    last_gpe_pos = None

        # Clean up names and yield
        for gpe in gpes:
            # Cleanup
            gpe = gpe.strip()
            if len(gpe) <= 2:
                continue

            if gpe.lower().endswith(" and"):
                gpe = gpe[0:-4]
            elif gpe.endswith(" &"):
                gpe = gpe[0:-2]

            gpe = strip_unicode_punctuation(gpe).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield gpe, sentence
            else:
                yield gpe
def get_trademarks(text) -> Generator:
    """
    Find trademarks in text.
    :param text:
    :return:
    """
    # Iterate through sentences
    if TRADEMARK_PTN_RE.search(text):
        for sentence in get_sentence_list(text):
            for phrase in np_extractor.get_np(sentence):
                tms = TRADEMARK_PTN_RE.findall(phrase)
                for tm in tms:
                    yield tm
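A minimal usage sketch (the sample text is illustrative; TRADEMARK_PTN_RE and np_extractor are assumed to be module-level objects as used above):

text = "The Widget(R) brand and the Gadget(TM) product line are licensed hereunder."

for tm in get_trademarks(text):
    print(tm)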
Example #9
def process(input_directory, input_file, section_name):
    #Speedup: bash runStanfordParserServer.sh, bash runSSTServer.sh

    import sys
    import pip
    import spacy
    import neuralcoref
    import lexnlp.nlp.en.segments.sentences as lex_sentences
    import question_generator as gen
    import csv
    import time

    #Load
    start_time = time.time()
    with open(input_file, 'r') as file:
        brief = file.read()
    print("--- %s seconds to Load ---" % (time.time() - start_time))

    #Preprocess
    ##start_time = time.time()
    ##brief = lex_sentences.pre_process_document(brief)
    ##print("--- %s seconds to LexNLP Preprocess---" % (time.time() - start_time))

    start_time = time.time()
    pronouns = spacy.load('en')
    neuralcoref.add_to_pipe(pronouns,
                            greedyness=0.5,
                            max_dist=100,
                            blacklist=False)
    neural = pronouns(brief)
    brief = neural._.coref_resolved
    print("--- %s seconds to Pronoun Fix ---" % (time.time() - start_time))

    #Tokenize
    start_time = time.time()
    sentences = list(lex_sentences.get_sentence_list(brief))
    questions = gen.QuestionGenerator()
    print("--- %s seconds to Tokenize ---" % (time.time() - start_time))

    #Print
    start_time = time.time()
    with open(input_directory + "/" + section_name + '.csv', 'w') as csvfile:
        qawriter = csv.writer(csvfile)
        qawriter.writerow(["Q", "A"])
        for sentence in sentences:
            flashcard = questions.generate_question(sentence)
            if flashcard:
                qawriter.writerow([flashcard[0]['Q'], flashcard[0]['A']])
    print("--- %s seconds to Generate Questions ---" %
          (time.time() - start_time))
Example #10
def get_definitions(text,
                    return_sources=False,
                    decode_unicode=True) -> Generator:
    """
    Find possible definitions in natural language in text.
    The text will be split to sentences first.
    :param decode_unicode: transliterate Unicode text to ASCII before matching
    :param return_sources: returns a tuple with the extracted term and the source sentence
    :param text: the input text
    :return:
    """

    for sentence in get_sentence_list(text):
        yield from get_definitions_in_sentence(sentence, return_sources,
                                               decode_unicode)
def run_sentence_token_test(text, result, lowercase=False, stopword=False):
    """
    Base test method to run against text with given results.
    """
    # Get list from text
    sentence_list = get_sentence_list(text)

    # Check length first
    assert len(sentence_list) == len(result)

    # Check each sentence matches
    for i in range(len(sentence_list)):
        tokens = lexnlp_tests.benchmark_extraction_func(get_token_list,
                                                        sentence_list[i], lowercase=lowercase, stopword=stopword)
        assert_list_equal(tokens, result[i])
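A minimal sketch of how this helper might be called from a test (the expected token lists are illustrative and depend on the actual tokenizer output):

def test_two_sentences():
    text = "This is one sentence. This is another."
    expected = [
        ["This", "is", "one", "sentence", "."],
        ["This", "is", "another", "."],
    ]
    run_sentence_token_test(text, expected)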
Example #12
    def process(section_text, section_name):
        #Speedup: bash runStanfordParserServer.sh, bash runSSTServer.sh

        import sys
        import pip
        import spacy
        import neuralcoref
        import lexnlp.nlp.en.segments.sentences as lex_sentences
        import question_generator as gen
        import csv
        import time

        #Load
        start_time = time.time()
        brief = section_text
        print("--- %s seconds to Load ---" % (time.time() - start_time))

        start_time = time.time()
        pronouns = spacy.load('en')
        neuralcoref.add_to_pipe(pronouns,
                                greedyness=0.5,
                                max_dist=100,
                                blacklist=False)
        neural = pronouns(brief)
        brief = neural._.coref_resolved
        print("--- %s seconds to Pronoun Fix ---" % (time.time() - start_time))

        #Tokenize
        start_time = time.time()
        sentences = list(lex_sentences.get_sentence_list(brief))
        questions = gen.QuestionGenerator()
        print("--- %s seconds to Tokenize ---" % (time.time() - start_time))

        #Print
        start_time = time.time()

        for sentence in sentences:
            flashcard = questions.generate_question(sentence)
            if flashcard:
                partial = {
                    "question": flashcard[0]['Q'],
                    "answer": flashcard[0]['A']
                }
                result[section_name].append(partial)

        print("--- %s seconds to Generate Questions ---" %
              (time.time() - start_time))
def get_locations(text,
                  strict=False,
                  return_source=False,
                  window=2) -> Generator:
    """
    Get locations from text using Stanford libraries.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = STANFORD_NER_TAGGER.tag(get_token_list(sentence))

        # Iterate through chunks
        locations = []
        last_loc_pos = None
        for i, token in enumerate(sentence_pos):
            # Check label
            if token[1] == 'LOCATION':
                if not strict and last_loc_pos is not None and (
                        i - last_loc_pos) < window:
                    locations[-1] += (" " if not token[0].startswith("'") else
                                      "") + token[0]
                else:
                    locations.append(token[0])
                last_loc_pos = i
            else:
                if token[0] in [".", ","]:
                    if not strict and last_loc_pos is not None and (
                            i - last_loc_pos) < window:
                        locations[-1] += (
                            " " if token[0] not in string.punctuation and
                            not token[0].startswith("'") else "") + token[0]
                        last_loc_pos = i

        # Cleanup and yield
        for location in locations:
            location = strip_unicode_punctuation(location).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield location, sentence
            else:
                yield location
def get_persons(text,
                strict=False,
                return_source=False,
                window=2) -> Generator:
    """
    Get persons from text using Stanford libraries.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = STANFORD_NER_TAGGER.tag(get_token_list(sentence))

        # Iterate through chunks
        names = []
        last_person_pos = None
        for i, token in enumerate(sentence_pos):
            # Check label
            if token[1] == 'PERSON':
                if not strict and last_person_pos is not None and (
                        i - last_person_pos) < window:
                    names[-1] += " " + token[0]
                else:
                    names.append(token[0])
                last_person_pos = i
            else:
                if token[0] in [".", ","]:
                    if not strict and last_person_pos is not None and (
                            i - last_person_pos) < window:
                        names[-1] += (" " if token[0] not in string.punctuation
                                      else "") + token[0]
                        last_person_pos = i

        # Cleanup and yield
        for name in names:
            name = strip_unicode_punctuation(name).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield name, sentence
            else:
                yield name
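A minimal usage sketch (the sample sentence is illustrative; STANFORD_NER_TAGGER requires the Stanford NER models to be installed and configured):

text = "John Smith met Mary Jones in Chicago."

for name in get_persons(text):
    print(name)

# (name, sentence) pairs
for name, sentence in get_persons(text, return_source=True):
    print(name, "<-", sentence)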
def get_conditions(text, strict=True) -> Generator:
    """
    Find possible conditions in natural language.
    :param text:
    :param strict:
    :return:
    """

    # Iterate through all potential matches
    for sentence in get_sentence_list(text):
        for match in RE_CONDITION.finditer(sentence):
            # Get individual group matches
            captures = match.capturesdict()
            num_pre = len(captures["pre"])
            num_post = len(captures["post"])

            # Skip if strict and empty pre/post
            if strict and (num_pre == 0 or num_post == 0):
                continue

            # Otherwise, append
            yield (captures["condition"].pop().lower(), captures["pre"].pop(),
                   captures["post"].pop())
Example #16
def get_persons(text,
                strict=False,
                return_source=False,
                window=2) -> Generator:
    """
    Get names from text.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))
        companies = list(get_company_annotations(text))

        # Iterate through chunks
        persons = []
        last_person_pos = None

        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            if type(chunk) == nltk.tree.Tree:
                # Check label
                if chunk.label() == 'PERSON':
                    if not strict and last_person_pos is not None and (
                            i - last_person_pos) < window:
                        persons[-1] += " " + " ".join([c[0] for c in chunk])
                    else:
                        persons.append(" ".join([c[0] for c in chunk]))
                    last_person_pos = i
            elif not strict and last_person_pos is not None and (
                    i - last_person_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    persons[-1] += " " + chunk[0]
                    last_person_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    if chunk[0].lower() in ["or"]:
                        continue
                    persons[-1] += (" " if chunk[0].lower() in ["&", "and"]
                                    else "") + chunk[0]
                    last_person_pos = i
                else:
                    last_person_pos = None

        # Cleanup and yield
        for person in persons:
            # Cleanup
            person = person.strip()
            if len(person) <= 2:
                continue

            if PERSONS_STOP_WORDS.search(person):
                continue

            person = strip_unicode_punctuation(person).strip(
                string.punctuation).strip(string.whitespace)

            if contains_companies(person, companies):
                continue

            if person.lower().endswith(" and"):
                person = person[0:-4]
            elif person.endswith(" &"):
                person = person[0:-2]

            if return_source:
                yield person, sentence
            else:
                yield person
Example #17
def get_companies(text: str,
                  use_article: bool = False,
                  detail_type: bool = False,
                  parse_name_abbr: bool = False,
                  return_source: bool = False) -> Generator:
    """
    Find company names in text, optionally using the stricter article/prefix expression.
    :param text:
    :param use_article:
    :param detail_type:
    :param parse_name_abbr:
    :param return_source:
    :return:
    """
    # Select regex
    re_c = RE_ARTICLE_COMPANY if use_article else RE_COMPANY

    # Iterate through sentences
    for sentence in get_sentence_list(text):
        for match in re_c.finditer(sentence):
            captures = match.capturesdict()
            company_type = captures["company_type_of"] or \
                           captures["company_type"] or \
                           captures["company_type_single"]
            company_type = "".join(company_type).strip(
                string.punctuation.replace(".", "") + string.whitespace)
            company_type = company_type or None

            company_name = "".join(captures["full_name"])
            if company_type:
                company_name = re.sub(r'%s$' % company_type, '', company_name)
            company_name = company_name.strip(
                string.punctuation.replace('&', '').replace(')', '') + string.whitespace)
            company_name = re.sub(r'^\s*(?:and|&|of)\s+|\s+(?:and|&|of)\s*$', '',
                                  company_name, flags=re.IGNORECASE)
            if not company_name:
                continue

            # f.e., a Delaware company
            if company_name.lower().startswith('a ') or captures.get('article') == ['a']:
                continue

            company_description = captures["company_description_of"] or \
                                  captures["company_description_and"] or \
                                  captures["company_description"] or \
                                  captures["company_description_single"]
            company_description = "".join(company_description).strip(
                string.punctuation + string.whitespace)
            # catch ABC & Company LLC case
            if company_description.lower() == 'company' and \
                    ('& company' in company_name.lower() or 'and company' in company_name.lower()):
                company_description = None
            company_description = company_description or None
            if company_description:
                company_name = re.sub(r'[\s,]%s$' % company_description, '', company_name)
                if not company_name or \
                        ARTICLE_RE.fullmatch(company_name) or \
                        re.match(r'.+?\s(?:of|in)$', company_name.lower()):
                    continue
            if company_name in COMPANY_DESCRIPTIONS:
                continue

            abbr_name = "".join(captures["abbr_name"]) or None

            ret = (company_name,
                   company_type)
            if detail_type:
                ret += (COMPANY_TYPES[company_type.lower()]['abbr'] if company_type else None,
                        COMPANY_TYPES[company_type.lower()]['label'] if company_type else None)
            ret += (company_description,)
            if parse_name_abbr:
                ret += (abbr_name,)
            if return_source:
                ret += (sentence,)
            # no args:         = [company_name, company_type, company_description]
            # detail_type:     + [company_type_abbr, company_type_label]
            # parse_name_abbr: + [abbr_name]
            # return_source:   + [source]
            yield ret
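A minimal usage sketch showing how the yielded tuple grows with each flag, matching the comment above (the sample sentence is illustrative):

text = "This agreement is between Acme Holdings LLC and Widget Industries, Inc."

# Default: (company_name, company_type, company_description)
for name, company_type, description in get_companies(text):
    print(name, company_type, description)

# detail_type + return_source:
# (name, type, type_abbr, type_label, description, sentence)
for row in get_companies(text, detail_type=True, return_source=True):
    print(row)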
    def process_fields(doc: LeaseDocument, doc_text: str, task: ExtendedTask):
        sentences = get_sentence_list(doc_text)
        # fields = detect_fields(sentences, groups=('address',))
        fields = detect_fields(sentences)

        doc.address = fields.get('address')
        if not doc.address:
            doc.address = detect_address_default(doc_text, sentences)

        # term
        doc.commencement_date = fields.get('commencement_date')
        doc.expiration_date = fields.get('expiration_date')

        term_tuple = fields.get('term')
        if term_tuple:
            term = timedelta(days=term_tuple[2])
            if doc.commencement_date and not doc.expiration_date:
                doc.expiration_date = doc.commencement_date + term
            elif not doc.commencement_date and doc.expiration_date:
                doc.commencement_date = doc.expiration_date - term

        if doc.commencement_date \
                and doc.expiration_date \
                and doc.commencement_date >= doc.expiration_date:
            doc.expiration_date = None

        # lease type
        pay_taxes = int(fields.get('pay_taxes') or False)
        pay_costs = int(fields.get('pay_costs') or False)
        pay_insurance = int(fields.get('pay_insurance') or False)
        lt = pay_taxes + pay_costs + pay_insurance
        if lt == 3:
            doc.lease_type = 'triple-net'
        elif lt == 2:
            doc.lease_type = 'double-net'
        elif lt == 1:
            doc.lease_type = 'single-net'
        else:
            doc.lease_type = 'gross'

        # property type
        property_types = list(fields.get('property_types__set') or set())
        property_types.sort()
        doc.property_type = '; '.join(property_types)

        # permitted use
        doc.permitted_uses = fields.get('permitted_use')

        # prohibited use
        doc.prohibited_uses = ProcessLeaseDocuments.ordered_list_without_repetitions(
            fields.get('prohibited_use__list'))
        renew_duration_tuple = fields.get('renew_non_renew_notice')
        if renew_duration_tuple:
            doc.renew_non_renew_notice_duration = timedelta(days=renew_duration_tuple[2])

        auto_renew = fields.get('auto_renew')
        if auto_renew is not None:
            doc.auto_renew = auto_renew

        area_square_feet_list = fields.get('area_square_feet__list')
        if area_square_feet_list:
            doc.area_size_sq_ft = area_square_feet_list[0]

        doc.alterations_allowed = ProcessLeaseDocuments.ordered_list_without_repetitions(
            fields.get('alterations_allowed__list'))

        security_deposit = fields.get('security_deposit__set')
        if security_deposit:
            doc.security_deposit = max(security_deposit)

        doc.rent_due_frequency = fields.get('rent_due_frequency')

        mean_rent_per_month = fields.get('mean_rent_per_month__set')
        if mean_rent_per_month:
            doc.mean_rent_per_month = max(mean_rent_per_month)
def process_document(document):
    doc_words = []
    for sentence in get_sentence_list(document):
        doc_words.extend(process_sentence(sentence))
    return doc_words
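process_sentence is not shown in this example; a hypothetical stand-in that keeps lower-cased alphabetic tokens might look like this (get_token_list is assumed to be the same tokenizer used in the other examples):

def process_sentence(sentence):
    # Hypothetical helper: lower-case alphabetic tokens only
    return [token.lower() for token in get_token_list(sentence) if token.isalpha()]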
Example #20
    def process_fields(doc: LeaseDocument, doc_text: str, task: ExtendedTask):
        sentences = get_sentence_list(doc_text)
        # fields = detect_fields(sentences, groups=('address',))
        fields = detect_fields(sentences)

        doc.address = fields.get('address')
        if not doc.address:
            doc.address = detect_address_default(doc_text, sentences)

        if doc.address:
            g = geocoder.google(doc.address)
            if g.ok:
                doc.address_latitude = g.lat
                doc.address_longitude = g.lng
                doc.address_country = g.country_long
                doc.address_state_province = g.province_long
            elif g.status and 'ZERO' in g.status:
                # Google does not know such address - probably we detected it wrong.
                doc.address = None
                doc.address_state_province = None
                doc.address_country = None
                doc.address_longitude = None
                doc.address_latitude = None
            else:
                task.log_warn(
                    'Google did not return geocode info for: {0}\nResponse: {1}'.format(doc.address,
                                                                                        g))
        # return

        # term
        doc.commencement_date = fields.get('commencement_date')
        doc.expiration_date = fields.get('expiration_date')

        term_tuple = fields.get('term')
        if term_tuple:
            term = timedelta(days=term_tuple[2])
            if doc.commencement_date and not doc.expiration_date:
                doc.expiration_date = doc.commencement_date + term
            elif not doc.commencement_date and doc.expiration_date:
                doc.commencement_date = doc.expiration_date - term

        if doc.commencement_date \
                and doc.expiration_date \
                and doc.commencement_date >= doc.expiration_date:
            doc.expiration_date = None

        # lease type
        pay_taxes = int(fields.get('pay_taxes') or False)
        pay_costs = int(fields.get('pay_costs') or False)
        pay_insurance = int(fields.get('pay_insurance') or False)
        lt = pay_taxes + pay_costs + pay_insurance
        if lt == 3:
            doc.lease_type = 'triple-net'
        elif lt == 2:
            doc.lease_type = 'double-net'
        elif lt == 1:
            doc.lease_type = 'single-net'
        else:
            doc.lease_type = 'gross'

        # property type
        property_types = list(fields.get('property_types__set') or set())
        property_types.sort()
        doc.property_type = '; '.join(property_types)

        # permitted use
        doc.permitted_uses = fields.get('permitted_use')

        # prohibited use
        doc.prohibited_uses = ProcessLeaseDocuments.ordered_list_without_repetitions(
            fields.get('prohibited_use__list'))
        renew_duration_tuple = fields.get('renew_non_renew_notice')
        if renew_duration_tuple:
            doc.renew_non_renew_notice_duration = timedelta(days=renew_duration_tuple[2])

        auto_renew = fields.get('auto_renew')
        if auto_renew is not None:
            doc.auto_renew = auto_renew

        area_square_feet_list = fields.get('area_square_feet__list')
        if area_square_feet_list:
            doc.area_size_sq_ft = area_square_feet_list[0]

        doc.alterations_allowed = ProcessLeaseDocuments.ordered_list_without_repetitions(
            fields.get('alterations_allowed__list'))

        security_deposit = fields.get('security_deposit__set')
        if security_deposit:
            doc.security_deposit = max(security_deposit)

        doc.rent_due_frequency = fields.get('rent_due_frequency')

        mean_rent_per_month = fields.get('mean_rent_per_month__set')
        if mean_rent_per_month:
            doc.mean_rent_per_month = max(mean_rent_per_month)
Example #21
def test_sentence_segmenter_empty():
    """
    Test basic sentence segmentation.
    """
    _ = get_sentence_list("")
Example #22
import sys
import lexnlp.nlp.en.segments.sentences as lex_sentences
import lexnlp.extract.en.dates as lex_dates
import lexnlp.extract.en.entities.nltk_maxent as lex_entities

direct_path = "/Users/brandon/Documents/Northwestern Courses/Winter 2019/CS+Law Innovation Lab/Orrick, Harrington, & Sutcliffe/Documents/Dish_Sample.txt"

with open(direct_path, 'r') as file:
    brief = file.read()

processed_brief = lex_sentences.pre_process_document(brief)
sentences_brief = lex_sentences.get_sentence_list(processed_brief)

facts = []

for sentence in sentences_brief:
    entities = lex_entities.get_persons(sentence)
    for entity in entities:
        facts.append((entity, sentence))

for fact in facts:
    print("Question:\nWhy is {} relevant?\n\nAnswer:\n{}".format(
        fact[0], fact[1]))
    print("\n---------------\n")
'''
Question:
Why is Farmers Branch relevant?

Answer:
In 2009, DISH began a pilot program to test QPC, a new incentive-based system at several locations, including two of its eight offices in the North Texas region: Farmers Branch and North Richland Hills.
Example #23
                        else:
                            text_content = ''
                    except Exception as e:
                        print(("error in content extraction", e))
                        continue

                    # skip if empty
                    if text_content is None:
                        continue
                    if len(text_content.strip()) == 0:
                        continue

                    try:
                        # build word2vec sentence list and doc2vec content simultaneously
                        doc_stems = []
                        for sentence in get_sentence_list(text_content):
                            sentence_stems = [
                                s for s in get_stem_list(
                                    sentence, stopword=True, lowercase=True)
                                if s.isalpha()
                            ]
                            doc_stems.extend(sentence_stems)
                            sentences.append(sentence_stems)
                        documents.append(
                            gensim.models.doc2vec.TaggedDocument(
                                doc_stems,
                                ["{0}".format(court_tar_member.name)]))
                    except Exception as e:
                        print(e)

    # word2vec models
Example #24
def get_companies(text: str,
                  strict: bool = False,
                  use_gnp: bool = False,
                  detail_type: bool = False,
                  count_unique: bool = False,
                  name_upper: bool = False,
                  parse_name_abbr: bool = False,
                  return_source: bool = False):
    """
    Find company names in text, optionally using the stricter article/prefix expression.
    :param text:
    :param strict:
    :param use_gnp: use get_noun_phrases or NPExtractor
    :param detail_type: return detailed type (type, unified type, label) vs type only
    :param name_upper: return company name in upper case.
    :param count_unique: return only unique companies - case insensitive.
    :param parse_name_abbr: return company abbreviated name if exists.
    :param return_source:
    :return:
    """
    # skip if all text is in uppercase
    if text == text.upper():
        return
    valid_punctuation = VALID_PUNCTUATION + ["(", ")"]

    unique_companies = dict()

    if COMPANY_TYPES_RE.search(text):
        # Iterate through sentences
        for sentence in get_sentence_list(text):
            # skip if whole phrase is in uppercase
            if sentence == sentence.upper():
                continue
            if use_gnp:
                phrases = get_noun_phrases(sentence, strict=strict,
                                           valid_punctuation=valid_punctuation)
            else:
                phrases = np_extractor.get_np(sentence)
            for phrase in phrases:
                if COMPANY_TYPES_RE.search(phrase):
                    for result in nltk_re.get_companies(phrase,
                                                        detail_type=True,
                                                        parse_name_abbr=True):
                        co_name, co_type, co_type_abbr, co_type_label, co_desc, co_abbr = result

                        if co_name == co_type or co_name == co_desc:
                            continue
                        if name_upper:
                            co_name = co_name.upper()

                        result = (co_name, co_type)

                        if detail_type:
                            result += (co_type_abbr, co_type_label, co_desc)
                        if parse_name_abbr:
                            result += (co_abbr,)
                        if return_source and not count_unique:
                            result = result + (sentence,)

                        if count_unique:
                            unique_key = (result[0].lower() if result[0] else None, co_type_abbr)
                            existing_result = unique_companies.get(unique_key)

                            if existing_result:
                                unique_companies[unique_key] = existing_result[:-1] + (existing_result[-1] + 1,)
                            else:
                                unique_companies[unique_key] = result + (1,)
                        else:
                            yield result

        if count_unique:
            for company in unique_companies.values():
                yield company
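A minimal usage sketch of the unique-count mode (the sample text is illustrative; with count_unique=True each yielded tuple ends with the number of occurrences instead of a source sentence):

text = "Acme Corp supplies parts to Acme Corp and to Widget LLC."

for row in get_companies(text, count_unique=True):
    *fields, occurrences = row
    print(fields, occurrences)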
Example #25
##brief = lex_sentences.pre_process_document(brief)
##print("--- %s seconds to LexNLP Preprocess---" % (time.time() - start_time))

start_time = time.time()
pronouns = spacy.load('en')
neuralcoref.add_to_pipe(pronouns,
                        greedyness=0.5,
                        max_dist=100,
                        blacklist=False)
neural = pronouns(brief)
brief = neural._.coref_resolved
print("--- %s seconds to Pronoun Fix ---" % (time.time() - start_time))

#Tokenize
start_time = time.time()
sentences = list(lex_sentences.get_sentence_list(brief))
questions = gen.QuestionGenerator()
print("--- %s seconds to Tokenize ---" % (time.time() - start_time))

#Print
start_time = time.time()
with open(
        '/Users/brandon/Documents/Northwestern Courses/Winter 2019/CS+Law Innovation Lab/Orrick, Harrington, & Sutcliffe/Documents/ex.csv',
        'w') as csvfile:
    qawriter = csv.writer(csvfile)
    qawriter.writerow(["Q", "A"])
    for sentence in sentences:
        flashcard = questions.generate_question(sentence)
        if flashcard:
            qawriter.writerow([flashcard[0]['Q'], flashcard[0]['A']])
print("--- %s seconds to Generate csv ---" % (time.time() - start_time))
Example #26
def get_organizations(text,
                      strict=False,
                      return_source=False,
                      window=2) -> Generator:
    """
    Get organizations from text.
    :param window:
    :param return_source:
    :param strict:
    :param text:
    :return:
    """
    # Iterate through sentences
    for sentence in get_sentence_list(text):
        # Tag sentence
        sentence_pos = nltk.pos_tag(get_token_list(sentence))

        # Iterate through chunks
        organizations = []
        last_org_pos = None
        for i, chunk in enumerate(nltk.ne_chunk(sentence_pos)):
            if type(chunk) == nltk.tree.Tree:
                # Check label
                if chunk.label() in ['ORGANIZATION']:
                    if not strict and last_org_pos is not None and (
                            i - last_org_pos) < window:
                        organizations[-1] += " " + " ".join(
                            [c[0] for c in chunk])
                    else:
                        organizations.append(" ".join([c[0] for c in chunk]))
                    last_org_pos = i
            elif not strict and last_org_pos is not None and (
                    i - last_org_pos) < window:
                if chunk[1] in ["NNP", "NNPS"]:
                    organizations[-1] += " " + chunk[0]
                    last_org_pos = i
                elif chunk[1] in ["CC"] or chunk[0] in VALID_PUNCTUATION:
                    if chunk[0].lower() in ["or"]:
                        continue
                    organizations[-1] += (" " if chunk[0].lower()
                                          in ["&", "and"] else "") + chunk[0]
                    last_org_pos = i
                else:
                    last_org_pos = None

        for org in organizations:
            # Cleanup
            org = org.strip()
            if len(org) <= 2:
                continue

            if org.lower().endswith(" and"):
                org = org[0:-4]
            elif org.endswith(" &"):
                org = org[0:-2]

            org = strip_unicode_punctuation(org).strip(
                string.punctuation).strip(string.whitespace)
            if return_source:
                yield org, sentence
            else:
                yield org
Example #27
    def get_sentences(self, text=None):
        if not text:
            text = self.text
        return list(lex_sentences.get_sentence_list(text))