Example #1
def test_phrase_matcher_sent_start(en_vocab, attr):
    _ = PhraseMatcher(en_vocab, attr=attr)  # noqa: F841
Example #2
#Tokenizing
for token in doc:
    print("tokens: ", token)

#Text preprocessing
print(f"Token \t\tLemma \t\tStopword".format('Token', 'Lemma', 'Stopword'))
print("-" * 40)
for token in doc:
    print(f"{str(token)}\t\t{token.lemma_}\t\t{token.is_stop}")

#---------------------------------------
#Pattern Matching
from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher(nlp.vocab, attr='LOWER')

terms = ['Galaxy Note', 'iPhone 11', 'iPhone XS', 'Google Pixel']
patterns = [nlp(text) for text in terms]
matcher.add("TerminologyList", None, *patterns)
print("i) ", patterns)

text_doc = nlp(
    "Glowing review overall, and some really interesting side-by-side "
    "photography tests pitting the iPhone 11 Pro against the "
    "Galaxy Note 10 Plus and last year’s iPhone XS and Google Pixel 3.")

matches = matcher(text_doc)
print(matches)

match_id, start, end = matches[0]
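
The snippet stops after unpacking the first match; for reference, a match tuple can be resolved back to its rule name and matched span through the vocab string store (same nlp and text_doc objects as above, the printed values are illustrative):

rule_name = nlp.vocab.strings[match_id]  # "TerminologyList"
matched_span = text_doc[start:end]       # e.g. the "iPhone 11" tokens
print(rule_name, matched_span.text)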
Example #3
def deidentifier_func(input_string, nlp_trained_model, nlp_blank_model,
                      choice):
    #doc = nlp_trained_model((open(input_string)).read())
    doc = nlp_trained_model(input_string)
    #original_string = open((input_string)).read()
    original_string = input_string

    # ** Calling the extract_regex function to get a list of all the matched regex patterns **
    date_list = extract_regex(
        r"\D([0-9]{4}|[0-9]{1,2})(\/|-)[0-9]{1,2}(\/|-)([0-9]{1,2}|[0-9]{4})\D",
        doc, original_string)

    for i in range(len(date_list)):
        date_list[i][1] = date_list[i][1] + 1
        date_list[i][2] = date_list[i][2] - 1
        date_list[i][0] = original_string[date_list[i][1]:date_list[i][2]]

    # ** For choice 1 **
    """if(choice == 1):
        for a in date_list:
            count = 0
            for i in range(a[1], a[1] + 4):
                if(original_string[i].isnumeric()):
                    count = count + 1
            if(count == 4):
                original_string=original_string[:a[1]+4]+''*(a[2]-a[1]-4)+original_string[a[2]:]
            else:
                count = 0
                for j in range(a[2], a[2]-5, -1):
                    if(original_string[j].isnumeric()):
                        count = count + 1
                if(count == 4):
                    original_string=original_string[:a[1]]+''*(a[2]-a[1]-4)+original_string[a[2]-4:]
                elif(count == 3):
                    original_string=original_string[:a[1]]+''*(a[2]-a[1]-2)+original_string[a[2]-2:]
                else:
                    original_string=original_string[:a[1]]+''*(a[2]-a[1])+original_string[a[2]:]

    """
    # ** For Choice 2 **
    date_shift = []
    temp_1 = 0
    temp_2 = 0
    random_value = randint(0, 90)
    if (choice == 2):
        for temp in range(len(date_list)):
            temp_list = []
            text = date_list[temp][0]
            start = date_list[temp][1] + temp_2
            end = date_list[temp][2] + temp_2
            # Converting dates to pandas datetime so as to use timedelta function
            pandas_date = pd.to_datetime(text,
                                         infer_datetime_format=True,
                                         errors='ignore')
            if (type(pandas_date) != str):
                pandas_date = pandas_date + timedelta(days=random_value)
                original_string = original_string[:start] + str(
                    pandas_date)[:-9] + original_string[end:]
                temp_2 = temp_2 + (len(str(pandas_date)[:-9]) - len(text))
                temp_list.append(str(pandas_date)[:-9])
                temp_list.append(start)
                temp_list.append(start + len(str(pandas_date)[:-9]))
                date_shift.append(temp_list)

    # ** Extracting all various identifiers using regex pattern **
    #dob_list = extract_regex(r"^(0[1-9]|1[012])[-/.](0[1-9]|[12][0-9]|3[01])[-/.](19|20)\\d\\d$",
    #                         doc, original_string)

    aadhar_list = extract_regex(r"(\d{4}(\s|\-)\d{4}(\s|\-)\d{4})", doc,
                                original_string)

    ssn_list = extract_regex(r"^\d{9}$", doc, original_string)

    mail_list = extract_regex(
        r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*)@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])",
        doc, original_string)

    ip_list = extract_regex(
        r"((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)",
        doc, original_string)

    # ** Now de-identifying them **
    #for a in dob_list:
    #    original_string = original_string[:a[1]]+'X'*(a[2]-a[1])+original_string[a[2]:]

    for a in aadhar_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]

    for a in ssn_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]

    for a in mail_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]

    for a in ip_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]

    # ** Now to extract urls and licence plate numbers from last updated original_string
    #    and then deidentifying them too **
    doc = nlp_trained_model(original_string)
    url_list = extract_regex(
        r"(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?",
        doc, original_string)

    license_plate_list = extract_regex(
        r"[A-Z]{2}[ -][0-9]{1,2}(?: [A-Z])?(?: [A-Z]*)? [0-9]{4}", doc,
        original_string)

    for a in url_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]

    for a in license_plate_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]

    # ** Now to extract contact details i.e phone numbers and fax numbers from last updated
    #    original_string and then deidentifying them too **
    doc = nlp_trained_model(original_string)
    #indian_ph_no = extract_regex(r"((\+*)((0[ -]+)*|(91 )*)(\d{12}+|\d{10}+))|\d{5}([- ]*)\d{6}",
    #                               doc, original_string)

    usa_ph_no = extract_regex(r"^(\([0-9]{3}\) |[0-9]{3}-)[0-9]{3}-[0-9]{4}$",
                              doc, original_string)

    phone_fax_list1 = extract_regex(
        r"(?:(?:(?:(\+)((?:[\s.,-]*[0-9]*)*)(?:\()?\s?((?:[\s.,-]*[0-9]*)+)(?:\))?)|(?:(?:\()?(\+)\s?((?:[\s.,-]*[0-9]*)+)(?:\))?))((?:[\s.,-]*[0-9]+)+))",
        doc, original_string)

    phone_fax_list2 = extract_regex(r"\D(\+91[\-\s]?)?[0]?(91)?[789]\d{9}\D",
                                    doc, original_string)

    for i in range(len(phone_fax_list2)):
        phone_fax_list2[i][1] = phone_fax_list2[i][1] + 1
        phone_fax_list2[i][2] = phone_fax_list2[i][2] - 1
        phone_fax_list2[i][0] = original_string[
            phone_fax_list2[i][1]:phone_fax_list2[i][2]]

    phone_fax_list = []
    for a in phone_fax_list1:
        phone_fax_list.append(a)
    for a in phone_fax_list2:
        phone_fax_list.append(a)

    for a in phone_fax_list1:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]

    for a in phone_fax_list2:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]

    #for a in indian_ph_no:
    #    original_string = original_string[:a[1]]+'X'*(a[2]-a[1])+original_string[a[2]:]

    for a in usa_ph_no:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]

    # ** Extracting account details and other identification details and deidentifying them**
    doc = nlp_trained_model(original_string)

    pan_list = extract_regex(r"[A-Z]{5}\d{4}[A-Z]{1}", doc, original_string)

    passport_list = extract_regex(r"[A-Z]{1}\d{7}", doc, original_string)

    account_and_serial_list = extract_regex(r"\d{9,18}", doc, original_string)

    credit_card_list = extract_regex(
        r"\d{5}(\s|\-)\d{5}(\s|\-)\d{5}|\d{4}(\s|\-)\d{4}(\s|\-)\d{4}(\s|\-)\d{4}",
        doc, original_string)

    for a in pan_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]

    for a in passport_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]

    for a in account_and_serial_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]

    for a in credit_card_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]

    # ** Extracting MRN (Medical Record Number) if present, assuming it to be 7 digits **
    doc = nlp_trained_model(original_string)
    mrn_list = extract_regex(r"\d{7}", doc, original_string)

    for a in mrn_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]

    # Now we've deidentified all the details except address

    # ** For extracting address we use a list of address_identifiers for addresses smaller
    #    than street names and match them with every element in spacy doc object.
    #    Matched objects are then added to our address_list **

    address_identifier = [
        'st', 'niwas', 'aawas', 'palace', 'road', 'block', 'gali', 'sector',
        'flr', 'floor', 'path', 'near', 'oppo', 'bazar', 'house', 'nagar',
        'bypass', 'bhawan', 'street', 'rd', 'sq', 'flat', 'lane', 'gali',
        'circle', 'bldg', 'ave', 'mandal', 'avenue', 'tower', 'nagar', 'marg',
        'chowraha', 'lane', 'heights', 'plaza', 'park', 'garden', 'gate',
        'villa', 'market', 'apartment', 'chowk'
    ]

    doc = nlp_trained_model(original_string)
    address_list = []

    for i in doc:
        if (len(i) > 1 and '\n' not in str(i)):
            if (str(i).lower() in address_identifier):
                address_list.append(i)

    # ** Now to remove the identified addresses after getting their position in original_string
    address_index = []
    temp_2 = 0
    length = len(original_string)
    for i in address_list:
        while (1):
            index = original_string.find(str(i), temp_2, length)
            if (index == -1):
                break
            if (index != 0 and index != length):
                if ((original_string[index - 1].isalpha()
                     or original_string[index + len(str(i))].isalpha())):
                    temp_2 = index + len(str(i))
                else:
                    break
        address_index.append(index)
        temp_2 = index + len(str(i))

    temp_1 = 0
    new_address_list = []
    if (address_index != []):
        temp_1 = address_index[0]
        a = []
        for b in address_index:
            if (b - temp_1 < 20):
                a.append(b)
                temp_1 = b
            else:
                new_address_list.append(a)
                a = []
                a.append(b)
                temp_1 = b
        new_address_list.append(a)

    # ** Removing the complete word in which the address_identifier was used **
    addr_list = []
    for a in new_address_list:
        flag = []
        j = a[0]
        while (j != -1 and original_string[j] not in [',', '\n', '.', ';']):
            j = j - 1
        startt = j
        index_1 = startt
        count = 8
        while (count and j != -1 and original_string[j] != '\n'):
            if (original_string[j].isdigit()):
                startt = j
            j = j - 1
            count = count - 1
        j = a[-1]
        #print(j)
        while (j != -1 and original_string[j] not in [',', '\n', '.', ';']):
            j = j + 1
        endd = j
        index_2 = endd
        count = 7
        while (count and j != length and original_string[j] != '\n'):
            if (original_string[j].isdigit()):
                endd = j
            j = j + 1
            count = count - 1

        if ((original_string[index_1] != '.'
             or original_string[index_2] != '.') and (index_2 - index_1) < 50):
            if (original_string[startt] == '\n'):
                startt = startt + 1
            if (original_string[endd] == '\n'):
                endd = endd - 1
            flag.append(original_string[startt:endd + 1])
            flag.append(startt)
            flag.append(endd)
            addr_list.append(flag)

    for a in addr_list:
        original_string = original_string[:a[1]] + 'X' * (
            a[2] - a[1]) + original_string[a[2]:]

    # ** After deidentifying all these details we are now left with only names, dates and ages,
    #    which cannot be reliably identified by regular expressions **

    # To extract dates we use spacy's pre-trained en_core_web_sm model along with
    # some modifications to the default model according to our requirements

    time_identifier = [
        'YEAR', 'YEARS', 'AGE', 'AGES', 'MONTH', 'MONTHS', 'DECADE', 'CENTURY',
        'WEEK', 'DAILY', 'DAY', 'DAYS', 'NIGHT', 'NIGHTS', 'WEEKLY', 'MONTHLY',
        'YEARLY'
    ]

    doc_1 = nlp_trained_model(original_string)
    new_date_list = []
    for entities in doc_1.ents:
        if (str(entities.text).count('X') < 2):
            date = []
            if (entities.label_ == 'DATE' and (sum([
                    True
                    if i not in original_string[entities.start_char:entities.
                                                end_char].upper() else False
                    for i in time_identifier
            ]) == len(time_identifier))
                    and (entities.end_char - entities.start_char) > 4 and
                    sum(c.isdigit()
                        for c in original_string[entities.start_char:entities.
                                                 end_char]) >= 1 and
                    sum(c.isalpha()
                        for c in original_string[entities.start_char:entities.
                                                 end_char]) >= 1):
                date.append(entities.text)
                date.append(entities.start_char)
                date.append(entities.end_char)
                new_date_list.append(date)

    for a in new_date_list:
        count = 0
        for i in range(a[1], a[1] + 4):
            if (original_string[i].isnumeric()):
                count = count + 1
        if (count == 4):
            original_string = original_string[:a[1] + 4] + 'X' * (
                a[2] - a[1] - 4) + original_string[a[2]:]
        else:
            count = 0
            for j in range(a[2], a[2] - 5, -1):
                if (original_string[j].isnumeric()):
                    count = count + 1
            if (count == 4):
                original_string = original_string[:a[1]] + 'X' * (
                    a[2] - a[1] - 4) + original_string[a[2] - 4:]
            elif (count == 3):
                original_string = original_string[:a[1]] + 'X' * (
                    a[2] - a[1] - 2) + original_string[a[2] - 2:]
            else:
                original_string = original_string[:a[1]] + 'X' * (
                    a[2] - a[1]) + original_string[a[2]:]

    final_date_list = []
    if (choice == 1):
        for a in date_list:
            final_date_list.append(a)
        for a in new_date_list:
            final_date_list.append(a)

    # final_date_list contains all the dates we extracted including regex and spacy model

    # ** Now for the age part, we use spaCy's PhraseMatcher,
    #    which takes the patterns we want to match as input and
    #    outputs the start and end index of each matched pattern **

    try:
        age_list = []
        matcher = PhraseMatcher(nlp_trained_model.vocab, attr="SHAPE")
        age_identifier = [
            'YEAR', 'YEARS', 'Y/O', 'AGES', 'AGE', 'Y.O', 'Y.O.', 'AGED',
            'AGE IS'
        ]
        matcher.add("age", None, nlp_blank_model("76 year old"),
                    nlp_blank_model("aged 58"), nlp_blank_model('aged 123'),
                    nlp_blank_model("54 y/o"), nlp_blank_model("age is 59"),
                    nlp_blank_model("123 y/o"), nlp_blank_model("ages 35"),
                    nlp_blank_model("age 45"), nlp_blank_model("ages 123"),
                    nlp_blank_model("age 123"),
                    nlp_blank_model("54 years old"),
                    nlp_blank_model("124 years old"), nlp3("41 y.o."),
                    nlp_blank_model("123 y.o."),
                    nlp_blank_model('113 year old'))

        doc = nlp_blank_model(original_string)
        for match_id, start, end in matcher(doc):
            if (sum([
                    True if i in str(doc[start:end]).upper() else False
                    for i in age_identifier
            ]) >= 1):
                a = []
                for i in range(start, end):
                    if (str(doc[i:i + 1]).isnumeric()):
                        if (int(str(doc[i:i + 1])) > 89):
                            result = original_string.find(str(doc[start:end]))
                            count = 0
                            for j in range(result,
                                           result + len(str(doc[start:end]))):
                                if (original_string[j:j + 1].isnumeric()
                                        and count == 0):
                                    sstart = j
                                if (original_string[j:j + 1].isnumeric()):
                                    count = count + 1
                            a.append(original_string[sstart:sstart + count])
                            a.append(sstart)
                            a.append(sstart + count)
                            age_list.append(a)
                            original_string = original_string[:
                                                              sstart] + 'X' * count + original_string[
                                                                  sstart +
                                                                  count:]
    except Exception:
        pass

    # ** Last step is packing all the extracted patterns into a dict **
    info_dict = {}
    info_dict['date'] = final_date_list
    #info_dict['dob'] = dob_list
    info_dict['aadhar'] = aadhar_list
    info_dict['ssn'] = ssn_list
    info_dict['mail'] = mail_list
    info_dict['ip'] = ip_list
    info_dict['url'] = url_list
    info_dict['licence_plate'] = license_plate_list
    #info_dict['indian_ph_no'] = indian_ph_no
    info_dict['usa_ph_no'] = usa_ph_no
    info_dict['phone_fax'] = phone_fax_list
    info_dict['pan'] = pan_list
    info_dict['passport'] = passport_list
    info_dict['account_details'] = account_and_serial_list
    info_dict['credit_card'] = credit_card_list
    info_dict['age'] = age_list
    info_dict['address'] = addr_list
    info_dict['medical_report_no'] = mrn_list
    info_dict['date_shift'] = date_shift

    shift = random_value

    if (choice == 1):
        return (original_string, info_dict, None)
    else:
        return (original_string, info_dict, shift)
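
extract_regex itself is not shown in this example. A minimal sketch that is consistent with how its return value is indexed above (mutable [matched_text, start_char, end_char] lists) might look like the following; the doc argument is accepted only to mirror the call sites and is unused in the sketch:

import re

def extract_regex(pattern, doc, original_string):
    # Hypothetical helper: one [text, start, end] list per regex match in the raw string.
    return [[m.group(), m.start(), m.end()]
            for m in re.finditer(pattern, original_string)]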
Example #4
def test_phrase_matcher_contains(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    matcher.add("TEST", None, Doc(en_vocab, words=["test"]))
    assert "TEST" in matcher
    assert "TEST2" not in matcher
Example #5
    def get_array_from_df_combined(self, df):

        rows = df.text.tolist()
        rows = [
            t.replace("\n", "рдХ") for t in rows
        ]  ## as spacy cannot handle consecutive newlines
        sep = " "
        text = sep.join(rows)

        if nlp.max_length < len(text):
            nlp.max_length = 1 + len(text)
        rows_token_indexes_in_text = list(
            np.cumsum([len(a) for a in nlp.tokenizer.pipe(rows)])
        )
        total_tokens = rows_token_indexes_in_text.pop()
        # assert(total_tokens == 1 + len(list(nlp.tokenizer.pipe([text]))[0]) ) - len(rows_token_indexes_in_text)

        def set_custom_boundaries(doc):
            for token_index in rows_token_indexes_in_text:
                doc[token_index].is_sent_start = True
            return doc

        nlp.add_pipe(set_custom_boundaries, before="tagger")
        doc = nlp(text)

        result_df = pd.DataFrame([], columns=["spacy_bin", "spacy_cat"])

        ## bigint features
        if self.cat_features:
            spacy_bigint_attributes = (
                self.spacy_vorn_attributes + self.spacy_vocab_attributes
            )
            tokens_features_bigint = doc.to_array(
                spacy_bigint_attributes
            ).astype("object")
            for i in range(tokens_features_bigint.shape[0]):
                for j in range(tokens_features_bigint.shape[1]):
                    tokens_features_bigint[i][j] = nlp.vocab[
                        tokens_features_bigint[i][j]
                    ].text
            tokens_features_big = np.split(
                tokens_features_bigint, rows_token_indexes_in_text
            )
            result_df["spacy_cat"] = tokens_features_big

        small_feat_list = []

        ## smallint features
        if self.num_features:
            tokens_features_smallint = doc.to_array(
                self.spacy_num_attributes
            ).astype("int8")
            small_feat_list.append(tokens_features_smallint)

        ## gzt features
        if self.gzt_features:
            phrase_matcher = PhraseMatcher(nlp.vocab)
            gzt_attributes = [a.upper() for a in list(self.GZT_LISTS.keys())]
            gzt_index_map = dict()
            for i, a in enumerate(gzt_attributes):
                gzt_index_map[nlp.vocab.strings[a]] = i

            gzt_patterns = list()
            for label, terms in self.GZT_LISTS.items():
                patterns = [nlp.make_doc(text) for text in terms]
                phrase_matcher.add(label.upper(), None, *patterns)

            gzt_matches = phrase_matcher(doc)

            token_gzt_features = np.zeros(
                shape=[len(doc), len(gzt_attributes)], dtype="int8"
            )

            for match_id, start, end in gzt_matches:
                gzt_attribute_index = gzt_index_map[match_id]

                span = doc[start:end]
                if span is not None:
                    for token in span:
                        # print(token.i, token)
                        token_gzt_features[token.i, gzt_attribute_index] = 1

            small_feat_list.append(token_gzt_features)

        # tokens_features_small = np.concatenate((tokens_features_smallint, token_gzt_features), axis=1)
        if len(small_feat_list) > 0:
            tokens_features_small = np.hstack(small_feat_list)
            tokens_features_small = np.split(
                tokens_features_small, rows_token_indexes_in_text
            )
            result_df["spacy_bin"] = tokens_features_small

        return result_df
Example #6
def build_phrase_matcher(name: str, phrases: List[str]) -> PhraseMatcher:
    """Builds a PhraseMatcher object."""
    matcher = PhraseMatcher(nlp.tokenizer.vocab)
    # PhraseMatcher patterns must be Doc objects, not raw strings
    matcher.add(name, [nlp.make_doc(phrase) for phrase in phrases])
    return matcher
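
A brief usage sketch, assuming a module-level nlp pipeline as the function implies (the model name and phrases are illustrative):

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")  # assumed pipeline

matcher = build_phrase_matcher("PHONES", ["iPhone 11", "Galaxy Note"])
doc = nlp("She upgraded from a Galaxy Note to an iPhone 11.")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)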
Example #7
def create_profile(text):
    name = text[1]
    text.remove(text[0])
    text.remove(text[1])
    text = str(text)
    text = text.lower()
    #below is the csv where we have all the keywords, you can customize your own
    loggingStart("CLASSIFYING KEYWORDS")
    keyword_dict = pd.read_csv('data.csv', encoding="ISO-8859-1")
    loggingEnd("CLASSIFYING KEYWORDS")
    loggingStart("CLASSIFYING ML BASED WORDS")
    ML_words = [
        nlp(text) for text in keyword_dict['Machine Learning'].dropna(axis=0)
    ]
    loggingEnd("CLASSIFYING ML BASED WORDS")
    loggingStart("CLASSIFYING DL BASED WORDS")
    DL_words = [
        nlp(text) for text in keyword_dict['Deep Learning'].dropna(axis=0)
    ]
    loggingEnd("CLASSIFYING DL BASED WORDS")
    loggingStart("CLASSIFYING PYTHON BASED WORDS")
    python_words = [
        nlp(text) for text in keyword_dict['Python Language'].dropna(axis=0)
    ]
    loggingEnd("CLASSIFYING PYTHON BASED WORDS")
    loggingStart("CLASSIFYING WEB BASED WORDS")
    web_words = [nlp(text) for text in keyword_dict['Web'].dropna(axis=0)]
    loggingEnd("CLASSIFYING WEB BASED WORDS")
    loggingStart("CLASSIFYING CYBER SECURITY BASED WORDS")
    security_words = [
        nlp(text) for text in keyword_dict['Cyber security'].dropna(axis=0)
    ]
    loggingEnd("CLASSIFYING CYBER SECURITY WORDS")

    loggingStart("MATCHING")
    matcher = PhraseMatcher(nlp.vocab)

    matcher.add('ML', None, *ML_words)
    matcher.add('DL', None, *DL_words)
    matcher.add('Web', None, *web_words)
    matcher.add('Python', None, *python_words)
    matcher.add('CS', None, *security_words)
    doc = nlp(text)

    d = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[
            match_id]  # get the unicode ID, i.e. 'COLOR'
        span = doc[start:end]  # get the matched slice of the doc
        d.append((rule_id, span.text))
    loggingEnd("MATCHING")
    keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i, j in Counter(d).items())
    #print(str(keywords))
    ## converting string of keywords to dataframe
    loggingStart("CONVERT TO DATAFRAMES")
    df = pd.read_csv(StringIO(keywords), names=['Keywords_List'])
    df1 = pd.DataFrame(df.Keywords_List.str.split(' ', 1).tolist(),
                       columns=['Subject', 'Keyword'])
    df2 = pd.DataFrame(df1.Keyword.str.split('(', 1).tolist(),
                       columns=['Keyword', 'Count'])
    df3 = pd.concat([df1['Subject'], df2['Keyword'], df2['Count']], axis=1)
    df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))
    label = list(df3['Subject'])
    loggingEnd("CONVERT TO DATAFRAMES")
    count = {}
    x = set(label)
    for i in x:
        y = label.count(i)
        count.update({y: i})
    final = max(list(count.keys()))

    data = {
        'Candidate Name': name,
        'Subject': [count[final]],
        'Count': [final]
    }

    df4 = pd.DataFrame(data)
    return (df4)
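
loggingStart and loggingEnd are project-specific helpers that the snippet does not include; a minimal stand-in (names and behaviour assumed) so the example can run:

import time

_log_started = {}

def loggingStart(label):
    # Hypothetical helper: note when a named step begins.
    _log_started[label] = time.time()
    print(f"[START] {label}")

def loggingEnd(label):
    # Hypothetical helper: report how long the named step took.
    elapsed = time.time() - _log_started.pop(label, time.time())
    print(f"[END]   {label} ({elapsed:.2f}s)")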
Example #8
def get_validator_matches(text):
    match_ents.clear()

    matcher = Matcher(nlp.vocab)
    phraseMatcher = PhraseMatcher(nlp.vocab, attr="LEMMA")

    adverbPattern = [{"POS": "ADV"}]
    matcher.add("Adverbs", match_adverb, adverbPattern)

    adjectivePattern = [{"POS": "ADJ"}]
    matcher.add("Adjectives", match_adjective, adjectivePattern)

    pastTenseVerbPattern1 = [{"TAG": "VBD"}]
    pastTenseVerbPattern2 = [{"TAG": "VBN"}]
    matcher.add("Passive Voice", match_passive, pastTenseVerbPattern1,
                pastTenseVerbPattern2)

    infinitivePattern1 = [{"LOWER": "be"}, {"POS": "ADJ"}, {"POS": "ADP"}]
    infinitivePattern2 = [{"LOWER": "to"}, {"POS": "VERB"}]
    matcher.add("Infinitive", match_infinitive, infinitivePattern1,
                infinitivePattern2)

    pronounPattern = [{"POS": "PRON"}]
    matcher.add("Pronoun", match_pronoun, pronounPattern)

    indefiniteArticles = ["a", "an"]
    indefiniteArticlePatterns = [nlp(text) for text in indefiniteArticles]
    phraseMatcher.add("Indefinite Articles", match_indefinite_articles,
                      *indefiniteArticlePatterns)

    vagueTerms = [
        "some", "any", "allowable", "several", "many", "lot of", "a few",
        "almost always", "very nearly", "nearly", "about", "close to",
        "almost", "approximate"
    ]
    vagueTermsPatterns = [nlp(text) for text in vagueTerms]
    phraseMatcher.add("Vague Terms", match_vague_terms, *vagueTermsPatterns)

    escapeClauses = [
        "so far as is possible", "as possible", "as little as possible",
        "where possible", "as much as possible",
        "if it should prove necessary", "if necessary",
        "to the extent necessary", "as appropriate", "as required",
        "to the extent practical", "if practicable"
    ]
    escapeClausesPatterns = [nlp(text) for text in escapeClauses]
    phraseMatcher.add("Escape Clauses", match_escape_clauses,
                      *escapeClausesPatterns)

    openEndedClauses = ["including but not limited to", "etc", "and so on"]
    openEndedPatterns = [nlp(text) for text in openEndedClauses]
    phraseMatcher.add("Open Ended Clauses", match_open_ended_clauses,
                      *openEndedPatterns)

    notTerms = ["not"]
    notPatterns = [nlp(text) for text in notTerms]
    phraseMatcher.add("Negations", match_negations, *notPatterns)

    universalQuantifiers = [
        "all", "any", "both", "completely", "prompt", "fast", "minimum",
        "maximum", "optimum"
    ]
    universalPatterns = [nlp(text) for text in universalQuantifiers]
    phraseMatcher.add("Immeasurable Quantifiers", match_universal_quantifier,
                      *universalPatterns)

    temporalDependencies = [
        "eventually", "before", "when", "after", "as", "once", "earliest",
        "latest", "instantaneous", "simultaneous", "while", "at last"
    ]
    temporalPatterns = [nlp(text) for text in temporalDependencies]
    phraseMatcher.add("Temporal Dependencies", match_temporal,
                      *temporalPatterns)

    doc = nlp(text)
    matches = matcher(doc)
    lowercaseDoc = nlp(text.lower())
    phraseMatches = phraseMatcher(lowercaseDoc)
    match_ents.sort(key=lambda x: x["start"])
    return match_ents
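
The module-level match_ents list and the match_* callbacks are not shown. A minimal sketch of one on_match callback, with the dict keys inferred from the sort on x["start"] above (keys other than "start" are assumptions):

match_ents = []

def make_callback(label):
    # Hypothetical factory for spaCy on_match callbacks that record matched spans.
    def on_match(matcher, doc, i, matches):
        match_id, start, end = matches[i]
        span = doc[start:end]
        match_ents.append({"start": span.start_char,
                           "end": span.end_char,
                           "label": label})
    return on_match

match_adverb = make_callback("Adverbs")  # and similarly for the other rules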
Example #9
 def __init__(self, nlp, terms, label):
     patterns = [nlp(term) for term in terms]
     self.term_list = terms
     self.matcher = PhraseMatcher(nlp.vocab)
     self.matcher.add(label, None, *patterns)
Example #10
 def __init__(self, nlp, terms):
   self.terms = terms
   self.matcher = PhraseMatcher(nlp.vocab)
   patterns = [nlp.make_doc(text) for text in terms]
   self.matcher.add("TerminologyList", None, *patterns)
   Doc.set_extension("phrase_matches", getter=self.matcher, force=True)
Example #11
def create_web_dev_profile(file):
    text = pdfextract(file) 
    text = str(text)
    text = text.replace("\\n", "")
    text = text.lower()
    #below is the csv where we have all the keywords, you can customize your own
    keyword_dict = pd.read_csv('web_developer_keywords.csv')
    keyword_total = list(keyword_dict.count())
    global total_sum
    total_sum = 0
    for i in keyword_total:
        total_sum = total_sum + i
        
    print('ee',total_sum)
    
    front_end = [nlp(text) for text in keyword_dict['Front End'].dropna(axis = 0)]
    back_end = [nlp(text) for text in keyword_dict['Back End'].dropna(axis = 0)]
    database = [nlp(text) for text in keyword_dict['Database'].dropna(axis = 0)]
    project = [nlp(text) for text in keyword_dict['Projects'].dropna(axis = 0)]
    frameworks = [nlp(text) for text in keyword_dict['Frameworks'].dropna(axis = 0)]
    
    #print(front_end)
   # print(back_end)
    #print(database)
   
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('FrontEnd', None, *front_end)
    matcher.add('BackEnd', None, *back_end)
    matcher.add('Database', None, *database)
    matcher.add('Projects', None, *project)
    matcher.add('Frameworks', None, *frameworks)
 
    doc = nlp(text)
    #print(doc)
    
    d = []  
    matches = matcher(doc)
   # print(matches)
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[match_id]  # get the unicode ID, i.e. 'COLOR'
        span = doc[start : end]  # get the matched slice of the doc
        d.append((rule_id, span.text))      
    keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i,j in Counter(d).items())
    
    ## converting string of keywords to dataframe
    df = pd.read_csv(StringIO(keywords),names = ['Keywords_List'])
    df1 = pd.DataFrame(df.Keywords_List.str.split(' ',1).tolist(),columns = ['Subject','Keyword'])
    df2 = pd.DataFrame(df1.Keyword.str.split('(',1).tolist(),columns = ['Keyword', 'Count'])
    df3 = pd.concat([df1['Subject'],df2['Keyword'], df2['Count']], axis =1) 
    df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))
    
    base = os.path.basename(file)
    filename = os.path.splitext(base)[0]
       
    name = filename.split('_')
    name2 = name[0]
    name2 = name2.lower()
    ## converting str to dataframe
    name3 = pd.read_csv(StringIO(name2),names = ['Candidate Name'])
    
    dataf = pd.concat([name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']], axis = 1)
    dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0], inplace = True)
    print(dataf)
    return(dataf)
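
pdfextract comes from the surrounding project and is not included here; one possible stand-in, assuming pdfminer.six is installed:

from pdfminer.high_level import extract_text

def pdfextract(file):
    # Hypothetical replacement: return the plain text of the PDF at `file`.
    return extract_text(file)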
Example #12
 def __get_country_matcher__(self):
     matcher = PhraseMatcher(self.nlp.vocab)
     countries = ['Czech Republic', 'Australia', 'Germany', 'Slovakia']
     patterns = list(self.nlp.pipe(countries))
     matcher.add('ANIMAL', None, *patterns)
     return matcher
Example #13
 def __get_animal_matcher__(self):
     matcher = PhraseMatcher(self.nlp.vocab)
     animals = ['dog', 'cat', 'mouse', 'dogs', 'cats', 'mice']
     patterns = list(self.nlp.pipe(animals))
     matcher.add('ANIMAL', None, *patterns)
     return matcher
print("Please wait whilst spaCy language library is loaded...")
nlp = spacy.load('en_core_web_md')
"""
//////////////////////////////////////////////////////
Change global values for bad words here
//////////////////////////////////////////////////////
"""
BAD_STEM_WORDS_LIST = [
    "you", "option", "accurate", "correct", "true", "can be", "only",
    "statement"
]
BAD_OPTION_WORDS_LIST = ["only", "statement", "all of the above"]

# Create spaCy PhraseMatchers (lowercase for case-insensitivity)
dnd_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
dnd_term = ["Drag and drop the"]
dnd_patterns = [nlp.make_doc(text) for text in dnd_term]
dnd_matcher.add("TerminologyList", None, *dnd_patterns)

canbe_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
canbe_term = ["can be"]
canbe_patterns = [nlp.make_doc(text) for text in canbe_term]
canbe_matcher.add("TerminologyList", None, *canbe_patterns)

negative_matcher = Matcher(nlp.vocab)
negative_matcher.add("NegativeList", None, [{
    'POS': 'VERB'
}, {
    'DEP': 'neg'
}], [{
Example #15
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
import datefinder
from .skills_extract import workex_extract_skills
#from dateparser.search import search_dates

edu_stop_heading = "skills|declaration|personal|education|academ|activities|projects|objective|professional|summary|background|internship|technical|activities|exposure|achievement"

base_path = os.path.dirname(__file__)

nlp = spacy.load('en_core_web_sm')

file = os.path.join(base_path, "titles_combined.txt")
file = open(file, "r", encoding='utf-8')
jobtitle = [line.strip().lower() for line in file]
jobtitlematcher = PhraseMatcher(nlp.vocab)
patterns = [
    nlp.make_doc(text) for text in jobtitle if len(nlp.make_doc(text)) < 10
]
jobtitlematcher.add("Job title", None, *patterns)


def extract_exp_section(terms, index_exp, heading_index):
    # temp_index_exp = index_exp
    # try:
    #     index_exp = heading_index.index(index_exp)
    # except:
    #     index_exp = 0
    # line_text = ""
    # try:
    #     if ((temp_index_exp+1) not in heading_index) or ((temp_index_exp+2) not in  heading_index ) or ((temp_index_exp+3) not in  heading_index )  :
Example #16
#
# show_ents(doc3)

# from spacy.tokens import Span
# doc = nlp(u'Tesla to build a BR factory for alot of money')
# ORG = doc.vocab.strings[u'ORG']
# print(ORG)
#
# print(doc.ents)

from spacy.tokens import Span
doc = nlp(u'Our company created a brand new vacuum cleaner This new vacuum-cleaner is the best in show'
       u'This new vacuum-cleaner is the best in show')
show_ents(doc)
from spacy.matcher import PhraseMatcher
encontrador = PhraseMatcher(nlp.vocab)
lista_frase = ['vacuum cleaner', 'vacuum-cleaner']
padroes_frase = [nlp(text) for text in lista_frase]
encontrador.add('novoproduto', None, *padroes_frase)
found_matches = encontrador(doc)
print(found_matches)
from spacy.tokens import Span
PROD = doc.vocab.strings[u'PRODUCT']  # PRODUCT is the tag, from the list of entity tags, that you assign to the words you want to add
print(found_matches)
new_ents = [Span(doc, match[1], match[2], label=PROD) for match in found_matches]  # here we use match[1] and match[2] (the second and third elements of each match tuple), which are respectively where the span we want to add starts and ends
doc.ents = list(doc.ents) + new_ents
show_ents(doc)
doc_encontra = nlp(u'Originally I paid $29.95 for this card, but now this card is much more expencive. It is now 50 dollars')
test = len([ent for ent in doc_encontra.ents if ent.label_ == 'MONEY'])
print(test)
Example #17
 def __init__(self, nlp, terms, label):
     patterns = [nlp.make_doc(text) for text in terms]
     self.matcher = PhraseMatcher(nlp.vocab)
     self.matcher.add(label, None, *patterns)
Example #18
# load pre-trained model
base_path = os.path.dirname(__file__)


nlp = spacy.load('en_core_web_sm')
custom_nlp2 = spacy.load(os.path.join(base_path,"degree","model"))
custom_nlp3 = spacy.load(os.path.join(base_path,"company_working","model"))

# initialize matcher with a vocab
matcher = Matcher(nlp.vocab)

file = os.path.join(base_path,"titles_combined.txt")
file = open(file, "r", encoding='utf-8')
designation = [line.strip().lower() for line in file]
designitionmatcher = PhraseMatcher(nlp.vocab)
patterns = [nlp.make_doc(text) for text in designation if len(nlp.make_doc(text)) < 10]
designitionmatcher.add("Job title", None, *patterns)

file = os.path.join(base_path,"LINKEDIN_SKILLS_ORIGINAL.txt")
file = open(file, "r", encoding='utf-8')    
skill = [line.strip().lower() for line in file]
skillsmatcher = PhraseMatcher(nlp.vocab)
patterns = [nlp.make_doc(text) for text in skill if len(nlp.make_doc(text)) < 10]
skillsmatcher.add("Job title", None, *patterns)


class resumeparse(object):

    objective = (
        'career goal',
Example #19
    def informal_word_detection(self, sent_list):
        """detection and replacement of informal words with formal words"""
        # get the punctuations for the manipulation
        punctuation_list = string.punctuation
        # define matchers used for replacement purpose
        matcher_rule = Matcher(nlp.vocab)
        matcher_phrase = PhraseMatcher(nlp.vocab)
        # define different types of verbs
        verb_types = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
        # get the list of informal word list
        with open('Model/informal_word_list.txt', 'r') as file:
            informal_word_list = ["" + line.strip() + "" for line in file]
        # get the list of formal word list
        with open('Model/formal_word_list.txt', 'r') as file:
            formal_word_list = ["" + line.strip() + "" for line in file]

        phrase_list = list()
        for i in range(len(informal_word_list)):
            try:
                # get the words that matcher informal word list
                word = informal_word_list[i]
                # check whether the word length is 1 and it's a verb
                if len(word.split()) == 1 and str(
                        nlp(word)[0].tag_) in verb_types:
                    # apply the rule base matching
                    # get the base verb of the selected verb
                    pattern = [{'LEMMA': word}, {'IS_PUNCT': True, 'OP': '?'}]
                    # register the pattern with matcher_rule
                    matcher_rule.add(str(i), None, pattern)
                else:
                    # assign the words to the list(phrase_list) that need to formalize with phrase matching technique
                    phrase_list.append(word)
            except Exception:
                continue
        # tokenize the phrases
        phrase_patterns = [nlp(text) for text in phrase_list]
        # register with matcher_phrase for direct phrase-level replacement
        matcher_phrase.add('Informal word matcher', None, *phrase_patterns)

        for i in range(len(sent_list)):
            # sentence tokenized
            sentense = nlp(sent_list[i])
            # check for matching with respect to rule base technique in the sentence
            matches_1 = matcher_rule(sentense)
            # check for matching with respect to phrase base technique in the sentence
            matches_2 = matcher_phrase(sentense)
            # unit the two matches into a single
            matches = matches_1 + matches_2

            # sort the matches by their position in the original sentence
            # to avoid complications when two matches overlap
            matches.sort(key=lambda x: x[1])

            if len(matches) != 0:

                try:
                    new_sent = ""
                    # declare variable for later use
                    previous_end = None
                    # match each informal word with its formal counterpart
                    for match in matches:
                        # get the informal word of the related match in sentence
                        informal_word = str(sentense[match[1]:match[2]])
                        # get the tag as word type - of single word match
                        word_type = str(sentense[match[1]:match[2]][0].tag_)
                        # the informal word list stores base forms, so check for other
                        # surface forms (verb inflections) that may occur;
                        # if these conditions match, handle them here
                        if not informal_word_list.__contains__(
                                informal_word) and word_type in verb_types:
                            # get the index of the base form of those words in informal list
                            index = informal_word_list.index(
                                sentense[match[1]:match[2]][0].lemma_)
                            # get the respective formal word using index.
                            # convert that formal word into initial word_type as detected(tenses)
                            formal_word = getInflection(
                                formal_word_list[index], tag=str(word_type))[0]

                        # applies for the phrase base direct replacement
                        else:
                            index = informal_word_list.index(informal_word)
                            formal_word = formal_word_list[index]
                        # get the respective formal word upon the index

                        # if it indicates a new sentence.
                        if previous_end is None:
                            new_sent = new_sent + str(
                                sentense[:match[1]]).strip(
                                ) + " " + formal_word
                            # if next character is not a punctuation need to put a space
                            if len(sentense) != match[2] and str(sentense[
                                    match[2]]) not in punctuation_list:
                                new_sent = new_sent + " "
                                previous_end = match[2]
                            else:
                                previous_end = match[2]

                        else:

                            # continuation of sentence
                            new_sent = new_sent + str(
                                sentense[previous_end:match[1]]).strip(
                                ) + " " + formal_word
                            # if next character is not a punctuation need to put a space
                            if len(sentense) != match[2] and str(sentense[
                                    match[2]]) not in punctuation_list:
                                new_sent = new_sent + " "
                                previous_end = match[2]
                            else:
                                previous_end = match[2]

                    new_sent = new_sent + str(sentense[previous_end:]).strip()
                    sent_list[i] = new_sent.strip()
                except Exception:
                    sent_list[i] = str(sentense)
        # for sent in sent_list:
        #     print(sent)
        self.tense_conversion_obj.future_tense_det(sent_list)
Example #20
    def __init__(self, nlp, ontoDict):
        # add ontology and label from ontoDict
        self.ontoDict = ontoDict
        self.all_labels = ""

        # stop words, don't try to match these
        stopwords = nlp.Defaults.stop_words
        stopwords.add("ands")
        stopwords.add("ends")
        stopwords.add("ci")

        self.ontols = []

        ontologies = ontoDict["ontologies"]
        for ontology in ontologies:
            for key, value in ontology.items():
                if (key == "label"):
                    self.all_labels = self.all_labels + value
                if (key == "ontology"):
                    self.ontols.append(value)
        # print("self.ontols: ", self.ontols)
        # for x in self.ontols:
        #     print("got x: ", x)
        # print("all_labels = ", self.all_labels)

        # for making plural forms of labels for text matching
        engine = inflect.engine()

        # init terms and patterns
        self.terms = {}
        patterns = []

        #build unified table of all ID, IRI, Label and Synonyms:
        for ontol in self.ontols:  #should be all ontols in
            print("checking ontol: ", ontol)
            for termid in ontol.get_classes():
                # print("k is: ", k)
                termshortid = ontol.get_id_for_iri(termid)

                label = ontol.get_annotation(termid, RDFSLABEL)
                definition = ontol.get_annotation(termid, DEFINITION)
                if label:
                    term_entry = {
                        'id': termid if termshortid is None else termshortid,
                        'name': label.strip(),
                        'definition': definition
                    }
                if label is not None and label.strip().lower(
                ) not in stopwords:
                    self.terms[label.strip().lower()] = term_entry
                    patterns.append(nlp.make_doc(label.strip().lower()))
                    plural = engine.plural(label.strip())
                    self.terms[plural.lower()] = term_entry
                    patterns.append(nlp.make_doc(plural.lower()))
                synonyms = ontol.get_annotations(termid, SYN)
                for s in synonyms:
                    # print("adding SYNONYM in ontotagtext: ", s)
                    if s.strip().lower() not in stopwords:
                        self.terms[s.strip().lower()] = term_entry
                        patterns.append(nlp.make_doc(s.strip().lower()))
                        try:
                            plural = engine.plural(s.strip().lower())
                            self.terms[plural.lower()] = term_entry
                            patterns.append(nlp.make_doc(plural.lower()))
                        except:
                            print("Problem getting plural of ", s)
                            continue

        # initialize matcher and add patterns
        self.matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        self.matcher.add(self.all_labels, None, *patterns)

        # set extensions to tokens, spans and docs
        Token.set_extension("is_ontol_term", default=False, force=True)
        Token.set_extension("ontol_id", default=False, force=True)
        Token.set_extension("merged_concept", default=False, force=True)
        Doc.set_extension("has_ontols", getter=self.has_ontols, force=True)
        Doc.set_extension("ontols", default=[], force=True)
        Span.set_extension("has_ontols", getter=self.has_ontols, force=True)
Example #21
File: main.py Project: pchding/kph
def mainpipe(inputfile, search_term, max_records, json_out, embvec, embvecache,
             val_ratio, rnnsize, batchsize, lr, weight_decay, n_epochs,
             model_save, es):
    if inputfile == 1:
        with open("input.txt", "r") as f:
            para = ast.literal_eval(f.read())
        search_term = para['search_term']
        max_records = para['max_records']
        embvec = para['embvec']
        embvecache = para['embvecache']
        val_ratio = para['val_ratio']
        rnnsize = para['rnnsize']
        batchsize = para['batchsize']
        lr = para['lr']
        weight_decay = para['weight_decay']
        n_epochs = para['n_epochs']
        model_save = para['model_save']
    if embvec == 1:
        embvec = torchtext.vocab.GloVe(name='840B', dim=300, cache=embvecache)
        use_pretrained = True
    with mlflow.start_run() as mlrun:
        pubmed = PubMed(tool="AlphabetH", email="*****@*****.**")
        query = search_term
        results = pubmed.query(query, max_results=max_records)
        pp = defaultdict(lambda: defaultdict(dict))
        for art in results:
            pmed = art.pubmed_id
            try:
                pp[pmed]['title'] = art.title
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = art.abstract
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.conclusions
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.methods
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['abstract'] = pp[pmed]['abstract'] + art.results
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['keywords'] = art.keywords
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['authors'] = art.authors
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['journal'] = art.journal
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['pubdate'] = str(art.publication_date.year)
            except (AttributeError, TypeError):
                pass
            try:
                pp[pmed]['conclusions'] = art.conclusions
            except (AttributeError, TypeError):
                pass
        print(subprocess.getoutput("python -m spacy download en_core_web_sm"))
        artpd = pd.DataFrame.from_dict(pp, orient='index')
        artpda = artpd[artpd.abstract.notnull()].copy()
        artpda = artpda[artpd.title.notnull()]
        #        artpda.index = pd.Series(artpda.index).apply(lambda x: x[0:8])
        artpdak = artpda[artpda.keywords.str.len() > 0].copy()
        dataf = pd.DataFrame(
            index=artpdak.index,
            columns=['SRC', 'TRG', 'keywords', 'Extracted', 'abskey'])
        dataf.loc[:, 'SRC'] = artpdak.title + ' ' + artpdak.abstract
        dataf.loc[:, 'keywords'] = artpdak.keywords
        svoc = spacy.load("en_core_web_sm")
        matcher = PhraseMatcher(svoc.vocab, attr="LOWER")
        for pmid in dataf.index:
            t0 = dataf.loc[pmid]
            patterns = [svoc.make_doc(str(name)) for name in t0.keywords]
            matcher.add("Names", None, *patterns)
            doc = svoc(t0.SRC)
            t1 = ['O'] * (len(doc))
            matched = []
            matn = 0
            for _, start, end in matcher(doc):
                t1[start] = 'B'
                t1[start + 1:end] = 'I' * (end - start - 1)
                if str(doc[start:end]).lower() not in matched:
                    matn = matn + 1
                    matched.append(str(doc[start:end]).lower())
            abskw = []
            for x in t0.keywords:
                if x.lower() not in matched:
                    abskw.append(x)
            dataf.loc[pmid, 'TRG'] = ' '.join([t for t in t1])
            dataf.loc[pmid, 'Extracted'] = matn
            dataf.loc[pmid, 'abskey'] = abskw
            matcher.remove("Names")
        datatrain = dataf[dataf['Extracted'] >= 3].copy()
        datatest = dataf[dataf['Extracted'] < 3].copy()
        # separate train and validate
        dtrain = datatrain.loc[:, ['SRC', 'TRG']]
        dtraink = datatrain.loc[:, ['SRC', 'TRG', 'keywords']]
        seed = 250
        idx = np.arange(datatrain.shape[0])
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        np.random.shuffle(idx)
        val_size = int(len(idx) * val_ratio)
        df_train = dtrain.iloc[idx[val_size:], :]
        df_val = dtrain.iloc[idx[:val_size], :]
        df_val_k = dtraink.iloc[idx[:val_size], :]
        df_test = datatest.loc[:, ['SRC', 'TRG']]
        dtraink = datatrain.loc[:, ['SRC', 'TRG', 'keywords']]
        df_val_k = dtraink.iloc[idx[:val_size], :]
        # Load original dataset
        datai = artpda.copy()
        datai = datai[datai.abstract.notnull()]
        datai = datai[datai.title.notnull()]
        datai = datai.replace('\n', ' ', regex=True)
        datai = datai.replace('\t', ' ', regex=True)
        dataiu = datai.loc[datai.keywords.str.len() == 0]
        dataik = datai.loc[datai.keywords.str.len() > 0]
        dataiu['SRC'] = dataiu.title + ' ' + dataiu.abstract
        tokenizertrg = lambda x: x.split()

        def tokenizersrc(text):  # create a tokenizer function
            return [tok.text for tok in svoc.tokenizer(text)]

        def safe_value(field_val):
            return field_val if not pd.isna(field_val) else "Other"

        def safe_year(field_val):
            return field_val if not pd.isna(field_val) else 1900

        TEXT = torchtext.data.Field(init_token='<bos>',
                                    eos_token='<eos>',
                                    sequential=True,
                                    lower=False)
        LABEL = torchtext.data.Field(init_token='<bos>',
                                     eos_token='<eos>',
                                     sequential=True,
                                     unk_token=None)
        fields = [('text', TEXT), ('label', LABEL)]
        device = 'cuda'
        train_examples = read_data(df_train, fields, tokenizersrc,
                                   tokenizertrg)
        valid_examples = read_data(df_val, fields, tokenizersrc, tokenizertrg)
        # Load the pre-trained embeddings that come with the torchtext library.
        if use_pretrained:
            print('We are using pre-trained word embeddings.')
            TEXT.build_vocab(train_examples, vectors=embvec)
        else:
            print('We are training word embeddings from scratch.')
            TEXT.build_vocab(train_examples, max_size=5000)
        LABEL.build_vocab(train_examples)
        # Create one of the models defined above.
        #self.model = RNNTagger(self.TEXT, self.LABEL, emb_dim=300, rnn_size=128, update_pretrained=False)
        model0 = RNNCRFTagger(TEXT,
                              LABEL,
                              rnnsize,
                              emb_dim=300,
                              update_pretrained=False)

        model0.to(device)
        optimizer = torch.optim.Adam(model0.parameters(),
                                     lr=lr,
                                     weight_decay=weight_decay)
        train(train_examples, valid_examples, embvec, TEXT, LABEL, device,
              model0, batchsize, optimizer, n_epochs)
        out2 = evaltest2(df_val, df_val_k, model0, tokenizersrc, fields,
                         device)
        ttp3 = kphperct(df_val_k, out2, svoc)
        mlflow.log_param("epochs", n_epochs)
        mlflow.pytorch.save_model(model0, model_save)
        mlflow.log_metric("extraction_rate", ttp3.mean())
        augout = evaltest2(dataiu, model0, tokenizersrc, fields, device)
        klist = kphext2(dataiu.SRC, augout, svoc)
        for i in range(len(dataiu.index)):
            dataiu.iloc[i, 2].extend(list(set(klist[i])))
        output = pd.concat([dataik, dataiu], join="inner")
        output.to_json('/home/pding/OneDrive/kph/MSaug.json', orient='index')
        if es == 1:
            output['journal'] = output['journal'].apply(safe_value)
            output['conclusions'] = output['conclusions'].apply(safe_value)
            output['pubdate'] = output['pubdate'].apply(safe_year)
            output['PMID'] = output.index
            test_server = [{'host': '127.0.0.1', 'port': 9200}]
            es_client = Elasticsearch(test_server, http_compress=True)
            use_these_keys = [
                'PMID', 'title', 'abstract', 'keywords', 'authors', 'pubdate'
            ]

            def filterKeys(document):
                return {key: document[key] for key in use_these_keys}

            def doc_generator(df):
                df_iter = df.iterrows()
                for index, document in df_iter:
                    try:
                        yield {
                            "_index": 'ms',
                            "_source": filterKeys(document),
                        }
                    except StopIteration:
                        return

            helpers.bulk(es_client, doc_generator(output))
        print(ttp3.mean())
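
# Hedged sketch (an assumption, not from the example above): one plausible shape
# for the read_data helper it calls, assuming legacy torchtext (<= 0.8) and a
# dataframe with 'SRC'/'TRG' text columns; names and body are illustrative only.
import torchtext

def read_data_sketch(df, fields, tokenizersrc, tokenizertrg):
    examples = []
    for _, row in df.iterrows():
        # Pre-tokenize; Field.preprocess leaves already-tokenized lists untouched.
        src_tokens = tokenizersrc(row['SRC'])
        trg_tokens = tokenizertrg(row['TRG'])
        examples.append(
            torchtext.data.Example.fromlist([src_tokens, trg_tokens], fields))
    return torchtext.data.Dataset(examples, fields)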
def test_issue4373():
    """Test that PhraseMatcher.vocab can be accessed (like Matcher.vocab)."""
    matcher = Matcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
    matcher = PhraseMatcher(Vocab())
    assert isinstance(matcher.vocab, Vocab)
def test_matcher_phrase_matcher(en_vocab):
    doc = Doc(en_vocab, words=["Google", "Now"])
    matcher = PhraseMatcher(en_vocab)
    matcher.add("COMPANY", None, doc)
    doc = Doc(en_vocab, words=["I", "like", "Google", "Now", "best"])
    assert len(matcher(doc)) == 1
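
# Hedged note (added): the add() calls in these snippets use the spaCy v2
# signature add(key, None, *docs). Under spaCy v3 the patterns are passed as a
# list and the callback moves to on_match; a minimal sketch assuming v3:
import spacy
from spacy.matcher import PhraseMatcher

nlp_v3 = spacy.blank("en")
matcher_v3 = PhraseMatcher(nlp_v3.vocab)
matcher_v3.add("COMPANY", [nlp_v3.make_doc("Google Now")], on_match=None)
print(matcher_v3(nlp_v3("I like Google Now best")))  # expect a single match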
Example #24
0
def tech_matcher_factory(nlp):
    matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    patterns = [nlp.make_doc(text) for text in _technology_terms]
    matcher.add("Phrase Matching", None, *patterns)
    return matcher
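
# Hedged usage sketch for the factory above: _technology_terms is assumed to be
# a module-level list of strings, and the spaCy version is assumed to accept the
# v2-style add(key, None, *patterns) call used inside the factory.
import spacy

_technology_terms = ["machine learning", "kubernetes", "tensorflow"]  # assumed
nlp = spacy.blank("en")
tech_matcher = tech_matcher_factory(nlp)
doc = nlp("We deploy TensorFlow models on Kubernetes clusters.")
for match_id, start, end in tech_matcher(doc):
    print(doc[start:end].text)  # attr='LOWER' makes the match case-insensitive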
def create_profile(file):
    text = pdfextract(file)
    text = str(text)
    text = text.replace("\\n", "")
    text = text.lower()
    #below is the csv where we have all the keywords, you can customize your own
    keyword_dict = pd.read_csv('D:/NLP_Resume/resume/template_new.csv')
    stats_words = [
        nlp(text) for text in keyword_dict['Statistics'].dropna(axis=0)
    ]
    NLP_words = [nlp(text) for text in keyword_dict['NLP'].dropna(axis=0)]
    ML_words = [
        nlp(text) for text in keyword_dict['Machine Learning'].dropna(axis=0)
    ]
    DL_words = [
        nlp(text) for text in keyword_dict['Deep Learning'].dropna(axis=0)
    ]
    R_words = [nlp(text) for text in keyword_dict['R Language'].dropna(axis=0)]
    python_words = [
        nlp(text) for text in keyword_dict['Python Language'].dropna(axis=0)
    ]
    Data_Engineering_words = [
        nlp(text) for text in keyword_dict['Data Engineering'].dropna(axis=0)
    ]

    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('Stats', None, *stats_words)
    matcher.add('NLP', None, *NLP_words)
    matcher.add('ML', None, *ML_words)
    matcher.add('DL', None, *DL_words)
    matcher.add('R', None, *R_words)
    matcher.add('Python', None, *python_words)
    matcher.add('DE', None, *Data_Engineering_words)
    doc = nlp(text)

    d = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[
            match_id]  # get the string name of the rule, e.g. 'Stats'
        span = doc[start:end]  # get the matched slice of the doc
        d.append((rule_id, span.text))
    keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i, j in Counter(d).items())

    ## converting string of keywords to dataframe
    df = pd.read_csv(StringIO(keywords), names=['Keywords_List'])
    df1 = pd.DataFrame(df.Keywords_List.str.split(' ', 1).tolist(),
                       columns=['Subject', 'Keyword'])
    df2 = pd.DataFrame(df1.Keyword.str.split('(', 1).tolist(),
                       columns=['Keyword', 'Count'])
    df3 = pd.concat([df1['Subject'], df2['Keyword'], df2['Count']], axis=1)
    df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))

    base = os.path.basename(file)
    filename = os.path.splitext(base)[0]

    name = filename.split('_')
    name2 = name[0]
    name2 = name2.lower()
    ## converting str to dataframe
    name3 = pd.read_csv(StringIO(name2), names=['Candidate Name'])

    dataf = pd.concat([
        name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']
    ],
                      axis=1)
    dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0],
                                   inplace=True)

    return (dataf)
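
# Hedged sketch of the pdfextract helper assumed above (it is not defined in the
# snippet); a minimal version using PyPDF2, though any PDF text extractor works.
from PyPDF2 import PdfReader

def pdfextract_sketch(file):
    reader = PdfReader(file)
    # extract_text() can return None for image-only pages, hence the fallback.
    return " ".join(page.extract_text() or "" for page in reader.pages)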
Example #26
0
def create_profile(file):

    text = getDocxContent(file)
    text = str(text)
    text = text.replace("\\n", "")
    text = text.lower()

    #below is the csv where we have all the keywords, you can customize your own

    keyword_dict = pd.read_csv(
        'D:/eclipse-workspace/ResumeParserUtilty/DataDictionary/AutomationProfileSearch.csv'
    )
    AutomationTool = [
        nlp(text) for text in keyword_dict['Automation tools'].dropna(axis=0)
    ]

    java_words = [
        nlp(text) for text in keyword_dict['Java Language'].dropna(axis=0)
    ]

    bigdata_words = [
        nlp(text) for text in keyword_dict['Big Data'].dropna(axis=0)
    ]

    JS_words = [
        nlp(text) for text in keyword_dict['JS Lanaguage'].dropna(axis=0)
    ]

    python_words = [
        nlp(text) for text in keyword_dict['Python Language'].dropna(axis=0)
    ]

    Data_Engineering_words = [
        nlp(text) for text in keyword_dict['Data Engineering'].dropna(axis=0)
    ]

    Bug_words = [
        nlp(text) for text in keyword_dict['Bug Tracking Tools'].dropna(axis=0)
    ]

    test_words = [
        nlp(text)
        for text in keyword_dict['Test Management Tool'].dropna(axis=0)
    ]

    Database_words = [
        nlp(text) for text in keyword_dict['DataBase'].dropna(axis=0)
    ]

    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('AutoTool', None, *AutomationTool)
    matcher.add('JAVA', None, *java_words)
    matcher.add('BigData', None, *bigdata_words)
    matcher.add('JS', None, *JS_words)
    matcher.add('Python', None, *python_words)
    matcher.add('DE', None, *Data_Engineering_words)
    matcher.add('JIRA', None, *Bug_words)
    matcher.add('TM', None, *test_words)
    matcher.add('DB', None, *Database_words)
    doc = nlp(text)

    d = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[
            match_id]  # get the string name of the rule, e.g. 'AutoTool'
        span = doc[start:end]  # get the matched slice of the doc
        d.append((rule_id, span.text))
    keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i, j in Counter(d).items())

    ## converting string of keywords to dataframe

    df = pd.read_csv(StringIO(keywords), names=['Keywords_List'])
    df1 = pd.DataFrame(df.Keywords_List.str.split(' ', 1).tolist(),
                       columns=['Subject', 'Keyword'])
    df2 = pd.DataFrame(df1.Keyword.str.split('(', 1).tolist(),
                       columns=['Keyword', 'Count'])
    df3 = pd.concat([df1['Subject'], df2['Keyword'], df2['Count']], axis=1)
    df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))

    base = os.path.basename(file)
    filename = os.path.splitext(base)[0]
    name = filename.split('_')
    name2 = name[0]
    name2 = name2.lower()
    ## converting str to dataframe
    name3 = pd.read_csv(StringIO(name2), names=['Candidate Name'])

    dataf = pd.concat([
        name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']
    ],
                      axis=1)
    dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0],
                                   inplace=True)

    return (dataf)
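
# Hedged sketch of the getDocxContent helper assumed above (not shown in the
# snippet); a minimal version built on python-docx.
from docx import Document

def getDocxContent_sketch(file):
    document = Document(file)
    # Join paragraph text so the caller can lowercase and match against it.
    return "\n".join(paragraph.text for paragraph in document.paragraphs)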
Example #27
0
def patternSearch(T_0, T, file, scoring_mode):
    current_patterns = [nlp(x) for x in T]
    phrase_matcher = PhraseMatcher(nlp.vocab)
    phrase_matcher.add('pattern search', None, *current_patterns)
    unranked_patterns = []
    # find occurrences of seed phrases
    with open(file, "r") as f:
        file_chunk = partition(f)
        for document in file_chunk:
            print(len(document))
            document = nlp(document)
            phrase_patterns = set()
            matches = phrase_matcher(document)
            for match_id, start, end in matches:
                p = tuple((start, end))
                if p not in phrase_patterns:
                    phrase_patterns.add(p)
            # find patterns around seed phrases
            for phrase_pattern in phrase_patterns:
                start = phrase_pattern[0]
                end = phrase_pattern[1]
                if (document[start - 1].text == '\n'):
                    continue
                # add context pattern
                tmp = []
                for i in range(2, 0, -1):
                    if document[start - 1].tag_ == "IN":
                        tmp.append({"TEXT": document[start - 1].text})
                        break
                    tmp.append({"TEXT": document[start - i].text})
                # add content pattern
                span = document[start:end]
                for token in span:
                    tmp.append({"POS": token.pos_})
                if tmp not in unranked_patterns:
                    unranked_patterns.append(tmp)
    unranked_phrases = list(getPhrases(file, unranked_patterns))

    l1, l2, l3, l4, m1, m2, m3, m4 = run_prdualrank(T_0, unranked_patterns, unranked_phrases, file)

    expanded_pattern_pre = [unranked_patterns[i] for i in l1]
    expanded_pattern_rec = [unranked_patterns[i] for i in l2]
    expanded_eid_pre = [unranked_phrases[i] for i in l3]
    expanded_eid_rec = [unranked_phrases[i] for i in l4]

    pattern2fscore = {}
    for i in range(len(unranked_patterns)):
        recall = m2[i]
        precision = m1[i]
        fscore = 0
        if scoring_mode == 0:
            if (recall + precision) == 0:
                fscore = 0
            else:
                fscore = ((2 * recall * precision) / (recall + precision))
        elif scoring_mode == 1:
            fscore = precision
        elif scoring_mode == 2:
            fscore = recall
        elif scoring_mode == 3:
            fscore = precision * recall
        elif scoring_mode == 4:
            fscore = precision + recall
        else:
            fscore = -100
        pattern2fscore[i] = fscore
    sorted_patterns_ids = sorted(pattern2fscore, key=pattern2fscore.__getitem__, reverse=True)
    sorted_patterns = [unranked_patterns[i] for i in sorted_patterns_ids]

    return sorted_patterns
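
# Hedged illustration of scoring_mode == 0 above: that branch is the harmonic
# mean (F1) of pattern precision (m1) and recall (m2); example values only.
precision, recall = 0.5, 0.25
f1 = 0 if (precision + recall) == 0 else (2 * precision * recall) / (precision + recall)
print(round(f1, 3))  # 0.333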
def getsentencetense(doc, tense_uses):
    material_patterns = [nlp(text) for text in tense_uses]
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('tense', None, *material_patterns)
    matches = matcher(doc)
    return len(matches)!=0
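
# Hedged usage sketch for getsentencetense: tense_uses is assumed to be a list
# of surface cues, and nlp is assumed to be the pipeline used elsewhere above.
future_cues = ["will be", "is going to"]
doc = nlp("The mixture will be heated for two hours.")
print(getsentencetense(doc, future_cues))  # True if any cue occurs in the doc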
Example #29
0
if res_cat is not None:
    st.header("CLASSIFICATION")
    res_cat = res_cat.get('result')[0]
    st.subheader(f"> {res_cat.get('category').capitalize()} ({res[0].get('result')[0].get('score')})")

# Extracted Entities
if res_ner is not None and len(res_ner.get('result')) > 0:
    st.header("NAMED ENTITIES")
    df_ner = pd.read_json(json.dumps(res_ner.get('result')))
    # Get value pairs as dict
    entity_names = {x:[] for x in df_ner.label}
    for x, y in zip(df_ner.label,df_ner.value):
        entity_names[x].append(y)
    entities = [*entity_names]
    # Create matcher
    matcher = PhraseMatcher(nlp.vocab)
    for key, value in entity_names.items():
        patterns = [nlp(entity) for entity in value] 
        matcher.add(key, None, *patterns)
    doc.ents = [ent for ent in list(doc.ents) if ent.label_ in entity_names]
    matches = matcher(doc)

    # Get matches in text
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[match_id]  
        span = doc[start : end]
    
    # Transform to spans and check for duplicates
    starts = []
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[match_id]
Example #30
0
def test_phrase_matcher_basic_check(en_vocab):
    matcher = PhraseMatcher(en_vocab)
    # Potential mistake: pass in pattern instead of list of patterns
    pattern = Doc(en_vocab, words=["hello", "world"])
    with pytest.raises(ValueError):
        matcher.add("TEST", pattern)