class Comparator(object):
    """Extract tags (people, locations, organizations, concepts) from an
    article and score its similarity to a dictionary of related words.

    The similar-words dictionary maps a word to a list of ``[label, weight]``
    pairs, where ``label`` may carry entity-type suffix markers ``/P``
    (person), ``/L`` (location) and ``/O`` (organization).
    """

    def __init__(self, simdir):
        """Load the similar-words dictionary from the JSON file *simdir*."""
        self.parser = Parser(None)
        # Read similar words dictionary; use a context manager so the file
        # handle is closed (the original left it open).
        with open(simdir) as f:
            self.similar_words = json.load(f)

    def find_similarities(self, input_text=None):
        """Return ``(word, score)`` pairs sorted by descending similarity.

        ``input_text`` is the article to analyse.  It defaults to ``None``
        only for backward compatibility of the signature; the original code
        called ``self.detect_tags()`` with no argument, which always raised
        ``TypeError`` because ``detect_tags`` requires the text.
        """
        if input_text is None:
            raise ValueError("find_similarities requires the article text")
        # detect_tags has the side effect of setting self.input.
        self.detect_tags(input_text)
        i_words = self.input.split(" ")
        similarities = {}
        for iw in i_words:
            if iw in self.similar_words:
                for s in self.similar_words[iw]:
                    if not s[0] in similarities:
                        similarities[s[0]] = s[1]
                    else:
                        similarities[s[0]] += s[1]
        sorted_similarities = sorted(similarities.items(),
                                     key=operator.itemgetter(1), reverse=True)
        return sorted_similarities

    def detect_tags(self, input_text):
        """Parse *input_text* and return a dict with keys ``people``,
        ``locations``, ``organizations`` and ``concepts``, each a list of
        names sorted by descending score.

        Side effects: sets ``self.input`` (parsed text) and
        ``self.entities`` (entity-name -> entity-type mapping).
        """
        tags = {}
        people = {}
        locations = {}
        organizations = {}
        concepts = {}
        print("Parse Article")
        self.input = self.parser.parse_plain_text(input_text)
        print("Find Entities in the Article")
        self.entities = self.parser.extract_entities(input_text)
        # Fixed: this was a Python-2 print statement (SyntaxError on
        # Python 3) while every other print in the class uses parentheses.
        print("Entities: \n" + str(self.entities))
        # Seed every directly-detected entity with a base score of 10.
        for ent in self.entities:
            if self.entities[ent] == "PERSON":
                people[ent] = 10
            elif self.entities[ent] == "GPE" or self.entities[ent] == "GSP":
                locations[ent] = 10
            elif self.entities[ent] == "FACILITY" or self.entities[ent] == "ORGANIZATION":
                organizations[ent] = 10
        print("Find Frequent Words")
        words = self.calc_freq_words(self.input)
        print("Find Important Words")
        important_words = self.find_important_words(input_text)
        for word in words:
            if word[1] == 1:
                break  # words are sorted by frequency; the rest are singletons
            score = word[1]  # count frequency
            if word[0] in important_words:
                score *= 2  # double if in the nutgraph of the article
            if word[0] in self.similar_words:
                for s in self.similar_words[word[0]]:
                    # Fixed: the original mutated s[0] in place, permanently
                    # corrupting self.similar_words across calls; work on a
                    # local copy of the label instead.
                    # Fixed: test for the "/P"/"/L"/"/O" suffix markers rather
                    # than the bare letters, which matched any capital P/L/O
                    # inside the word itself.
                    label = s[0]
                    if "/P" in label:
                        if "/O" in label:
                            label = re.sub("/O", "", label)
                        if "/L" in label:
                            label = re.sub("/L", "", label)
                        stripped = re.sub("/P", "", label)
                        if stripped in people:
                            people[stripped] += s[1] + score
                        else:
                            people[stripped] = score
                    elif "/L" in label:
                        if "/O" in label:
                            label = re.sub("/O", "", label)
                        stripped = re.sub("/L", "", label)
                        if stripped in locations:
                            locations[stripped] += s[1] + score
                        else:
                            locations[stripped] = score
                    elif "/O" in label:
                        stripped = re.sub("/O", "", label)
                        if stripped in organizations:
                            organizations[stripped] += s[1] + score
                        else:
                            organizations[stripped] = score
                    else:
                        if label in concepts:
                            concepts[label] += s[1] + score
                        else:
                            concepts[label] = score
        print("Sort Entities and Concepts")
        sorted_people = sorted(people.items(), key=operator.itemgetter(1), reverse=True)
        sorted_orgs = sorted(organizations.items(), key=operator.itemgetter(1), reverse=True)
        sorted_locations = sorted(locations.items(), key=operator.itemgetter(1), reverse=True)
        sorted_concepts = sorted(concepts.items(), key=operator.itemgetter(1), reverse=True)
        tags["people"] = [sp[0] for sp in sorted_people]
        tags["locations"] = [sl[0] for sl in sorted_locations]
        tags["organizations"] = [so[0] for so in sorted_orgs]
        tags["concepts"] = []
        for sc in sorted_concepts:
            tags["concepts"].append(sc[0])
            if len(tags["concepts"]) > 500:
                break  # cap the concept list; the others stay unbounded
        return tags

    def calc_freq_words(self, input_text):
        """Return ``(word, count)`` pairs for the space-separated words of
        *input_text*, sorted by descending count."""
        vocab = {}
        for word in input_text.split(" "):
            if word in vocab:
                vocab[word] += 1
            else:
                vocab[word] = 1
        sorted_freq_words = sorted(vocab.items(),
                                   key=operator.itemgetter(1), reverse=True)
        return sorted_freq_words

    def find_important_words(self, input_text):
        """Detect entities that appear in the article's nutgraph.

        The nutgraph is taken to be the first quarter of the article's
        words.  Returns a list of words, with entity words suffixed by a
        type marker.
        """
        words = input_text.split()
        limit = int(len(words) / 4)
        # Hoisted: join the nutgraph text once instead of twice.
        nutgraph_text = " ".join(words[0:limit])
        nutgraph = self.parser.parse_plain_text(nutgraph_text)
        nutgraph_entities = self.parser.extract_entities(nutgraph_text)
        important = []
        for w in nutgraph.split(" "):
            if w in nutgraph_entities:
                # NOTE(review): these suffixes lack the "/" used by the
                # "/P"/"/L"/"/O" markers in detect_tags — confirm the
                # intended format before relying on the cross-match.
                if nutgraph_entities[w] == "PERSON":
                    w = w + "P"
                elif nutgraph_entities[w] == "GPE" or nutgraph_entities[w] == "GSP":
                    w = w + "L"
                elif nutgraph_entities[w] == "FACILITY" or nutgraph_entities[w] == "ORGANIZATION":
                    w = w + "O"
            important.append(w)
        return important