def annotate(text, properties_step):
    '''
    Use NLPCoreClient to annotate text and extract relations.
    :param text: list of sentences (list of str)
    :param properties_step: 1 or 2, selecting the first or the second pipeline
    :return: annotated document
    '''
    properties_1 = {
        "annotators": "tokenize,ssplit,pos,lemma,ner",
        "ner.useSUTime": "0"
    }

    properties_2 = {
        "annotators": "tokenize,ssplit,pos,lemma,ner,parse,relation",
        # Second pipeline; leave out parse,relation for first
        "parse.model": "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
        # Must be present for the second pipeline!
        "ner.useSUTime": "0"
    }
    nlpcore_path = os.path.abspath("stanford-corenlp-full-2017-06-09")
    # nlpcore_path = "/Users/vibrioh/local_projects/stanford-corenlp-full-2017-06-09"

    client = NLPCoreClient(nlpcore_path)
    if properties_step == 1:
        pipeline = properties_1
    else:
        pipeline = properties_2
    doc = client.annotate(text, pipeline)

    return doc
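
A minimal invocation sketch for the function above (assumes os and NLPCoreClient are imported and the stanford-corenlp-full-2017-06-09 distribution sits next to the script):

sentences = ["Bill Gates works at Microsoft.", "Sergei works at Google."]

# First pipeline: tokenize, ssplit, pos, lemma, ner only
doc = annotate(sentences, properties_step=1)
for sentence in doc.sentences:
    print([token.ner for token in sentence.tokens])

# Second pipeline adds parse,relation and exposes sentence.relations
doc = annotate(sentences, properties_step=2)
print(doc.sentences[0].relations[0])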
Example 2
    def __init__(self):
        self.relation_map = {
            1: "Live_In",
            2: "Located_In",
            3: "OrgBased_In",
            4: "Work_For"
        }

        self.entity_map = {
            1: ['PEOPLE', 'LOCATION'],
            2: ['LOCATION', 'LOCATION'],
            3: ['ORGANIZATION', 'LOCATION'],
            4: ['PEOPLE', 'ORGANIZATION']
        }

        self.queries = set()

        # (word1, word2) -> (entity1, entity2, relation, prob)
        self.X = {}

        # Parameters from input
        self.SEARCH_JSON_API_KEY = ""
        self.SEARCH_ENGINE_ID = ""
        self.RELATION = 0
        self.THRESHOLD = 0
        self.QUERY = ""
        self.k = 0

        # Retrieved set
        self.retrieved_url = set()

        # Client
        self.client = NLPCoreClient(
            os.path.abspath("stanford-corenlp-full-2017-06-09"))
Example 3
def tag_relations(phrases):
    client = NLPCoreClient(STANFORD_CORENLP_PATH)
    properties = {
        "annotators": "",
        "parse.model": "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
        "ner.useSUTime": "0"
    }

    # annotate second pipeline
    properties["annotators"] = "tokenize,ssplit,pos,lemma,ner,parse,relation"
    doc = client.annotate(text=phrases, properties=properties)

    # Iterate through all relations; evaluate, print, and record them
    relations = []
    for sentence in doc.sentences:
        relations.extend(record_relations(sentence))

    return relations
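
A quick sketch of how tag_relations might be driven (STANFORD_CORENLP_PATH and record_relations are defined elsewhere in this project; the input phrase is a placeholder):

phrases = ["Bill Gates works at Microsoft."]
for relation in tag_relations(phrases):
    print(relation)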
Example 4
def find_query_term_occurrences(text):
    """Annotate text with the Stanford CoreNLP."""
    client = NLPCoreClient(STANFORD_CORENLP_PATH)
    properties = {
        "annotators": "",
        "parse.model": "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
        "ner.useSUTime": "0"
    }

    # annotate first pipeline
    properties["annotators"] = "tokenize,ssplit,pos,lemma,ner"
    doc = client.annotate(text=text, properties=properties)

    # find sentences with matching tokens from query
    eligiblePhrases = []
    for sentence in doc.sentences:
        s = eval_sentence(sentence)
        if s is not False:
            eligiblePhrases.append(s)

    return eligiblePhrases
Example 5
    def __init__(self, lib):
        self.client = NLPCoreClient(lib)
        # some default parameters to feed the "annotator"
        self.annotators = "tokenize,ssplit,pos,lemma,ner,parse,relation"
        self.parsemodel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
        self.useSUTime = "0"
        # used for filtering sentences that contain certain named entities like 'PERSON'
        # e.g. if the relation is 'Work_For' then there must be at least one 'PERSON'
        # entity and one 'ORGANIZATION' entity in the sentence
        self.entity_filters = {
            "Live_In": lambda x: x['PERSON'] > 0 and x['LOCATION'] > 0,
            "Located_In": lambda x: x['LOCATION'] > 1,
            "OrgBased_In":
            lambda x: x['LOCATION'] > 0 and x['ORGANIZATION'] > 0,
            "Work_For": lambda x: x['PERSON'] > 0 and x['ORGANIZATION'] > 0
        }

        if DEBUG:
            # cache the parsed results under a local directory for faster debugging
            self.cache_dir = "parsed"
            if self.cache_dir and not os.path.exists(self.cache_dir):
                os.makedirs(self.cache_dir)
                os.makedirs(self.cache_dir + '/1')
                os.makedirs(self.cache_dir + '/2')
Example 6
class NLPParser(object):

    ## the constructor
    #  @param lib the root path of Stanford NLP tools, type: str
    def __init__(self, lib):
        self.client = NLPCoreClient(lib)
        # some default parameters to feed the "annotator"
        self.annotators = "tokenize,ssplit,pos,lemma,ner,parse,relation"
        self.parsemodel = "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz"
        self.useSUTime = "0"
        # used for filtering sentences that contain certain named entities like 'PERSON'
        # e.g. if the relation is 'Work_For' then there must be at least one 'PERSON'
        # entity and one 'ORGANIZATION' entity in the sentence
        self.entity_filters = {
            "Live_In": lambda x: x['PERSON'] > 0 and x['LOCATION'] > 0,
            "Located_In": lambda x: x['LOCATION'] > 1,
            "OrgBased_In":
            lambda x: x['LOCATION'] > 0 and x['ORGANIZATION'] > 0,
            "Work_For": lambda x: x['PERSON'] > 0 and x['ORGANIZATION'] > 0
        }

        if DEBUG:
            # cache the parsed results under a local directory for faster debugging
            self.cache_dir = "parsed"
            if self.cache_dir and not os.path.exists(self.cache_dir):
                os.makedirs(self.cache_dir)
                os.makedirs(self.cache_dir + '/1')
                os.makedirs(self.cache_dir + '/2')
        # END - if DEBUG

    ## calling the annotator
    #  @param lines texts to process, type: list(str)
    #  @param annotators specific annotators, type: str
    #  @param chunk size of chunk to process in one batch, type: int
    #
    #  @ret list of data.Sentence instances, each representing a parsed sentence
    def annotate(self, lines, annotators=None, chunk=1e6):
        if not annotators:
            annotators = self.annotators
        properties = {
            "annotators": annotators,
            "parse.model": self.parsemodel,
            "ner.useSUTime": self.useSUTime
        }

        # split the processing into chunks to avoid memory overflow or
        # generating gigantic temp files (like the input.txt.html);
        # it might not be necessary, so the option is left here
        sentences = []
        i = 0
        chunk = int(chunk)
        while i < len(lines):
            j = min(len(lines), i + chunk)
            if i >= j:
                break
            # call the annotator, a Python wrapper around the Java CoreNLP code
            document = self.client.annotate(text=lines[i:j],
                                            properties=properties)
            sentences.extend(document.sentences)
            i = j
        return sentences

    ## first round of parsing, screening sentences with relevant named entities
    #  @param key key to the document, for caching, type: str
    #  @param lines texts to process, type: list(str)
    #  @param relation relation of interest (e.g. Work_For), type: str
    #
    #  @ret list of sentences in pure text
    def __first_round(self, key, lines, relation):
        if DEBUG:
            # use hash value of url + relation string to name the cache file
            fname = "{}/1/{}.txt".format(self.cache_dir,
                                         abs(hash(key + relation)))
            if os.path.exists(fname):
                with open(fname, 'r') as f:
                    return [l.rstrip('\n') for l in f]
        # END - if DEBUG

        res = []
        if relation not in self.entity_filters:
            return res
        entity_filter = self.entity_filters[relation]

        # calling a selection of annotators (no parsing or relation)
        sentences = self.annotate(lines, "tokenize,ssplit,pos,lemma,ner")

        for sentence in sentences:
            if len(sentence.tokens) >= 50:
                ##
                # sometimes the scraper returns very long sentences that are
                # computationally expensive but usually not productive in
                # generating relation tuples; the 50-word sentence length limit
                # comes from several experiments on the trade-off between
                # performance and correctness
                continue

            # count number of named entities and filter sentences accordingly
            entity_counts = defaultdict(int)
            for token in sentence.tokens:
                entity_counts[token.ner] += 1
            # filter by named entity counts
            if entity_filter(entity_counts):
                line = u' '.join([token.word for token in sentence.tokens
                                  ]).encode('ascii',
                                            'ignore').replace('|', '')
                res.append(line)

        if DEBUG:
            with open(fname, 'w') as f:
                for line in res:
                    f.write(line + '\n')
        # END - if DEBUG

        return res

    ## second round of parsing, get the relations
    #  @param key key to the document, for caching, type: str
    #  @param lines texts to process, type: list(str)
    #  @param relation relation of interest (e.g. Work_For), type: str
    #
    #  @ret list of relation descriptions (entity#1 value, entity#2 value,
    #       entity#1 type, entity#2 type, confidence, sentence text), type: tuple
    def __second_round(self, key, lines, relation):
        if DEBUG:
            # use hash value of url + relation to name the cache file
            fname = "{}/2/{}.txt".format(self.cache_dir,
                                         abs(hash(key + relation)))
            if os.path.exists(fname):
                with open(fname, 'r') as f:
                    return [tuple(l.rstrip('\n').split('|')) for l in f]
        # END - if DEBUG

        # calling a full set of annotators (default)
        sentences = self.annotate(lines)

        res = []
        for sentence in sentences:
            raw = ""  # raw text of the sentence
            for rel in sentence.relations:
                # each relation in the sentence carries a relation type, its
                # probability (confidence), and a pair of entities with value and type.
                # skip the relation if the confidence of the relation we are looking
                # for is not the highest among all relations
                probabilities = rel.probabilities
                if float(probabilities.get(relation, -1)) < max(
                        map(float, probabilities.values())):
                    continue

                e = rel.entities
                if len(e) == 2:
                    if not raw:
                        # construct the raw text of sentence now by
                        # joining the "word"s of its tokens
                        raw = u' '.join([t.word for t in sentence.tokens
                                         ]).encode('ascii', 'ignore')
                    # append the relation description as a tuple to the results
                    res.append((e[0].value.rstrip(), e[1].value.rstrip(), \
                        e[0].type.rstrip(), e[1].type.rstrip(), \
                        probabilities[relation], raw))

        if DEBUG:
            with open(fname, 'w') as f:
                for line in res:
                    f.write("|".join(map(str, line)) + '\n')
        # END - if DEBUG

        return res

    ## extract relation tuples from a search document
    #  @param doc the search document with scraped text, type: SearchDocument
    #  @param relation relation of interest (e.g. Work_In), type: str
    #
    #  @ret list of relation tuples, type: list(RelationTuple)
    def extract_relation(self, doc, relation):
        key = doc.key

        # first round, screen sentences
        lines = self.__first_round(key, doc.text.split('\n'), relation)

        # second round, get relations
        relations = self.__second_round(key, lines, relation)

        # combine relation tuples with same entities
        res = {}
        for t in relations:
            (v0, v1, t0, t1, prob, sentence) = t
            rt = RelationTuple(v0, v1, t0, t1, prob, sentence, relation)
            key = hash(rt)
            if key not in res or res[key] < rt:
                # new tuple or better than existing, add to results
                res[key] = rt
        return res.values()
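
A hedged end-to-end sketch for NLPParser (SearchDocument and RelationTuple come from other modules of this project; the CoreNLP path and the search document are assumptions):

parser = NLPParser(os.path.abspath("stanford-corenlp-full-2017-06-09"))
# search_doc is assumed to be a SearchDocument exposing .key and .text
for rt in parser.extract_relation(search_doc, "Work_For"):
    print(rt)  # best-confidence RelationTuple per distinct entity pair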
Example 7
class InformationExtractionEngine:
    def __init__(self):
        self.relation_map = {
            1: "Live_In",
            2: "Located_In",
            3: "OrgBased_In",
            4: "Work_For"
        }

        self.entity_map = {
            1: ['PEOPLE', 'LOCATION'],
            2: ['LOCATION', 'LOCATION'],
            3: ['ORGANIZATION', 'LOCATION'],
            4: ['PEOPLE', 'ORGANIZATION']
        }

        self.queries = set()

        # (word1, word2) -> (entity1, entity2, relation, prob)
        self.X = {}

        # Parameters from input
        self.SEARCH_JSON_API_KEY = ""
        self.SEARCH_ENGINE_ID = ""
        self.RELATION = 0
        self.THRESHOLD = 0
        self.QUERY = ""
        self.k = 0

        # Retrieved set
        self.retrieved_url = set()

        # Client
        self.client = NLPCoreClient(
            os.path.abspath("stanford-corenlp-full-2017-06-09"))

    def find_entities(self, sentences):
        """
        Find the named entities in a list of sentences, serving as the first pipeline
        :return: list of entities
        """
        entities = []

        properties = {
            "annotators": "tokenize,ssplit,pos,lemma,ner",
            "parse.model":
            "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
            "ner.useSUTime": "0"
        }
        doc = self.client.annotate(text=sentences, properties=properties)

        for sentence in doc.sentences:
            for token in sentence.tokens:
                if token.ner != 'O':
                    entities.append(token.ner)

        return entities

    def extract_relation_from_page(self, sentences):
        """
        Extract relations from a sentence list (retrieved from a web page); if the threshold and RELATION-type conditions are satisfied, update the relation set
        :return: void
        """
        properties = {
            "annotators": "tokenize,ssplit,pos,lemma,ner,parse,relation",
            "parse.model":
            "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
            "ner.useSUTime": "0"
        }
        doc = self.client.annotate(text=sentences, properties=properties)

        # print doc.tree_as_string()

        for sentence in doc.sentences:
            for relation in sentence.relations:
                try:
                    probs = relation.probabilities
                    # Find max relation and corresponding max prob
                    max_relation = max(probs.iterkeys(),
                                       key=(lambda key: probs[key]))
                    max_prob = probs[max_relation]
                    # If the max prob is over the threshold and the max relation matches the current relation, update relations
                    if max_prob > self.THRESHOLD and max_relation == self.relation_map[
                            self.RELATION]:
                        key = (relation.entities[0].value,
                               relation.entities[1].value)

                        # check the entity types against the expected pair for this relation
                        if float(max_prob) >= float(
                                self.THRESHOLD
                        ) and self.entity_map[
                                self.RELATION][0] == relation.entities[
                                    0].type and self.entity_map[self.RELATION][
                                        1] == relation.entities[1].type:
                            if key not in self.X or float(
                                    self.X[key][3]) < float(max_prob):
                                self.X[key] = (relation.entities[0].type,
                                               relation.entities[1].type,
                                               max_relation, max_prob)
                                print "Sentence:",
                                for token in sentence.tokens:
                                    print " " + token.word,
                                print ""

                                print "Relation Type: {0:10}| Confidence: {1:.3f}  | EntityType1: {2:15} | EntityValue1: {3:15} | EntityType2: {4:15} | EntityValue2: {5:15}".format(
                                    max_relation, float(max_prob),
                                    relation.entities[0].type,
                                    relation.entities[0].value,
                                    relation.entities[1].type,
                                    relation.entities[1].value)
                except:
                    pass

    def extract_relation(self):
        """
        This function executes each iteration
        1. Google Search and get a list of sentences for each url
        2. First pipeline, find entities and identify page
        3. If pass, second pipeline, extract relation and update relations
        :return:
        """
        web_pages = self.google_search()
        # print "Web Page Retrieved"
        for page in web_pages:
            entities = self.find_entities(page)
            # print entities
            if self.identity_page(entities):
                # print "Page Identified"
                self.extract_relation_from_page(page)
                # print "Page Processed"

    def identity_page(self, entities):
        """
        Check whether the entities from one retrieved web page contain the entities required for its relation
        :return: boolean
        """
        if self.RELATION == 1:
            # Live_In
            if 'PERSON' in entities and 'LOCATION' in entities:
                return True
            else:
                return False
        elif self.RELATION == 2:
            # Located_In
            location_count = 0
            for entity in entities:
                if entity == 'LOCATION':
                    location_count += 1
            if location_count < 2:
                return False
            else:
                return True
        elif self.RELATION == 3:
            # OrgBased_In
            if 'ORGANIZATION' in entities and 'LOCATION' in entities:
                return True
            else:
                return False
        elif self.RELATION == 4:
            # Work_For
            if 'ORGANIZATION' in entities and 'PERSON' in entities:
                return True
            else:
                return False

    def google_search(self):
        """
        Return the Top-10 results of Google search using QUERY
        :return: list
        """
        results = []

        # Google search
        url = "https://www.googleapis.com/customsearch/v1?key=" + self.SEARCH_JSON_API_KEY + "&cx=" + self.SEARCH_ENGINE_ID + "&q=" + self.QUERY
        response = requests.get(url)
        search_results = json.loads(response.text)['items']

        # Retrieve each url and extract plain text
        for item in search_results:
            item_url = item['link']
            if item_url not in self.retrieved_url:
                try:
                    text = self.extract_text_from_page(item_url)
                    results.append(text)
                except:
                    pass
            self.retrieved_url.add(item_url)

        return results

    def extract_text_from_page(self, url):
        """
        Extract plain text from a web page pointed by url
        :return: list of sentences(str)
        """
        response = requests.get(url)
        soup = BeautifulSoup(response.text, "html.parser")

        # Kill all script and style elements
        for script in soup(["script", "style"]):
            script.extract()

        # Get text
        text = soup.get_text()

        # Break into lines and remove leading and trailing space on each
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines
                  for phrase in line.split("  "))
        # text = [chunk.decode('unicode_escape').encode('ascii','ignore') + "." for chunk in chunks if chunk]
        text = [chunk.encode('utf-8') + "." for chunk in chunks if chunk]

        sentences = []
        for t in text:
            sentences.extend(re.split(r' *[\.\?!][\'"\)\]]* *', t))
        results = [sentence + "." for sentence in sentences]

        return results

    def read_parameters(self):
        """
        Read parameters from the command line and assign them to instance attributes
        :return: void
        """
        inputs = sys.argv

        if len(inputs) < 7:
            self.usage()
            sys.exit(1)

        self.SEARCH_JSON_API_KEY = inputs[1]
        self.SEARCH_ENGINE_ID = inputs[2]
        self.RELATION = int(inputs[3])
        self.THRESHOLD = float(inputs[4])
        self.QUERY = inputs[5]
        self.k = int(inputs[6])

        # Print to console
        print("Search Key: " + self.SEARCH_JSON_API_KEY)
        print("Search Engine ID: " + self.SEARCH_ENGINE_ID)
        print("RELATION: " + self.relation_map[self.RELATION])
        print("THRESHOLD: " + str(self.THRESHOLD))
        print("QUERY: " + self.QUERY)
        print("# of Tuples: " + str(self.k))

    def usage(self):
        sys.stderr.write(
            """Usage: python main.py <SEARCH_JSON_API_KEY> <SEARCH_ENGINE_ID> <RELATION> <THRESHOLD> <QUERY> <k>.\n"""
        )

    def run(self):
        self.read_parameters()
        ind = 1
        while len(self.X) < self.k:
            print("Iteration " + str(ind) + ": query - " + self.QUERY)
            self.extract_relation()
            sorted_X = sorted(self.X.items(),
                              key=lambda (k, v): v[3],
                              reverse=True)
            print("=====Relations=====")
            count = 0
            for t in sorted_X:
                count += 1
                if count > self.k:
                    break
                print "Relation Type: {0:10}| Confidence: {1:.3f}  | Entity #1: {2:15}| Entity #2: {3:15}".format(
                    self.relation_map[self.RELATION], float(t[1][3]), t[0][0],
                    t[0][1])

            for t in sorted_X:
                temp_query = t[0][0] + " " + t[0][1]
                if not temp_query in self.queries:
                    self.queries.add(temp_query)
                    break

            # Check whether we would enter an infinite loop (no new query found)
            if self.QUERY == temp_query:
                sys.stderr.write(
                    """Can not find enough relation instances, program ends. Please try another set of parameters.\n"""
                )
                sys.exit(1)
            self.QUERY = temp_query
            ind += 1
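
A sketch of the entry point this engine appears to expect (the command-line arguments are parsed by read_parameters, as described in usage):

if __name__ == "__main__":
    engine = InformationExtractionEngine()
    engine.run()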
Example 8
from NLPCore import NLPCoreClient

text = ["Bill Gates works at Microsoft.", "Sergei works at Google."]

#path to corenlp
client = NLPCoreClient('/path/to/stanford-corenlp-full-2017-06-09')
properties = {
	"annotators": "tokenize,ssplit,pos,lemma,ner,parse,relation",
	"parse.model": "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",
	"ner.useSUTime": "0"
	}
doc = client.annotate(text=text, properties=properties)
print(doc.sentences[0].relations[0])
print(doc.tree_as_string())
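
The same client also serves the lighter first pipeline; a sketch following the property sets used in the examples above (token attributes follow the data model shown earlier):

properties_first = {
    "annotators": "tokenize,ssplit,pos,lemma,ner",
    "ner.useSUTime": "0"
}
doc_first = client.annotate(text=text, properties=properties_first)
for token in doc_first.sentences[0].tokens:
    print(token.word, token.ner)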
Example 9
def makeQuery(apiKey, engineID, relation, threshold, query, k):
    if int(relation) == 1:
        relationName = "Live_In"
    elif int(relation) == 2:
        relationName = "Located_In"
    elif int(relation) == 3:
        relationName = "OrgBased_In"
    else:
        relationName = "Work_For"

    print("Parameters:")
    print("Client Key	= " + apiKey)
    print("Engine Key 	= " + engineID)
    print("Relation 	= " + relationName)
    print("Threshold 	= " + str(threshold))
    print("Query 		= " + query)
    print("# of tuples 	= " + str(k))
    iterationNum = 1
    goodTuples = 0
    extractedRelations = 0
    totalExtractedRelations = 0
    tuples = set()
    queries = set()
    while goodTuples < int(k):
        totalExtractedRelations = goodTuples
        service = build("customsearch", "v1", developerKey=apiKey)

        res = service.cse().list(
            q=query,
            cx=engineID,
        ).execute()
        print("=========== Iteration: " + str(iterationNum) + " - Query: " +
              query + " ===========")
        for i in range(10):
            extractedRelations = 0
            solution = res[u'items'][i][u'link'].encode('ascii', 'ignore')
            print("Processing: " + solution)
            try:
                r = urllib.urlopen(solution).read()
            except Exception as e:
                print(
                    "Program could not extract text content from this web site; moving to the next one..."
                )
                continue
            soup = BeautifulSoup(r)
            texts = soup.find_all(['h1', 'h2', 'h3', 'p'])
            result = []
            for text in texts:
                result.append(text.text.encode('ascii', 'ignore'))
            client = NLPCoreClient('stanford-corenlp-full-2017-06-09')
            properties = {
                "annotators":
                "tokenize,ssplit,pos,lemma,ner",  #Second pipeline; leave out parse,relation for first
                "parse.model":
                "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",  #Must be present for the second pipeline!
                "ner.useSUTime": "0"
            }
            properties2 = {
                "annotators":
                "tokenize,ssplit,pos,lemma,ner,parse,relation",  #Second pipeline; leave out parse,relation for first
                "parse.model":
                "edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz",  #Must be present for the second pipeline!
                "ner.useSUTime": "0"
            }
            doc = client.annotate(text=result, properties=properties)
            goodSentences = []
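            # First pass: keep only sentences whose NER tags contain both
            # entity types required by the selected relation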
            for sen in doc.sentences:
                tok1 = False
                tok2 = False
                if int(relation) == 1:
                    for token in sen.tokens:
                        if token.ner == "PERSON":
                            tok1 = True
                        elif token.ner == "LOCATION":
                            tok2 = True
                elif int(relation) == 2:
                    for token in sen.tokens:
                        if token.ner == "LOCATION" and not tok1:
                            tok1 = True
                        elif tok1 and token.ner == "LOCATION":
                            tok2 = True
                elif int(relation) == 3:
                    for token in sen.tokens:
                        if token.ner == "ORGANIZATION":
                            tok1 = True
                        elif token.ner == "LOCATION":
                            tok2 = True
                else:
                    for token in sen.tokens:
                        if token.ner == "PERSON":
                            tok1 = True
                        elif token.ner == "ORGANIZATION":
                            tok2 = True
                if tok1 and tok2:
                    goodSentences.append(sen)
            finalSentences = []
            for sentence in goodSentences:
                newsentence = ""
                for x in sentence.tokens:
                    newsentence += " " + x.word.encode('ascii', 'ignore')
                finalSentences.append(newsentence)
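            # Second pass: run the full pipeline (parse + relation) on the
            # screened sentences only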
            doc2 = client.annotate(text=finalSentences, properties=properties2)
            list1 = []
            list2 = []
            for s1 in doc2.sentences:
                list1.append(s1)
            for s3 in list1:
                counterr = 0
                for s4 in s3.relations:
                    if counterr == 2:
                        break
                    counterr += 1
                    if relationValid(s4, relation, relationName):
                        print(
                            "=============== EXTRACTED RELATION ==============="
                        )
                        extractedRelations += 1
                        newsentence1 = ""
                        for x1 in s3.tokens:
                            newsentence1 += " " + x1.word
                        print("Sentence: " + newsentence1)
                        confidence = s4.probabilities[relationName]
                        enTy1 = s4.entities[0].type
                        enVa1 = s4.entities[0].value
                        enTy2 = s4.entities[1].type
                        enVa2 = s4.entities[1].value
                        print("RelationType: " + relationName +
                              " | Confidence= " + confidence +
                              " | EntityType1= " + enTy1 + " |")
                        print("EntityValue1= " + enVa1 + " | EntityType2= " +
                              enTy2 + " | EntityValue2= " + enVa2 + " |")
                        print(
                            "============== END OF RELATION DESC =============="
                        )
                        if (float(confidence) >= float(threshold)
                                and float(confidence) >= float(
                                    s4.probabilities["Live_In"])
                                and float(confidence) >= float(
                                    s4.probabilities["OrgBased_In"])
                                and float(confidence) >= float(
                                    s4.probabilities["Located_In"])
                                and float(confidence) >= float(
                                    s4.probabilities["Work_For"])):
                            tuples.add((relationName,
                                        round(float(confidence),
                                              3), enTy1, enVa1, enTy2, enVa2))
            totalExtractedRelations += extractedRelations
            print("Relations extracted from this website: " +
                  str(extractedRelations) + " (Overall: " +
                  str(totalExtractedRelations) + ")")
        print("Pruning relations below threshold...")
        goodTuples = len(tuples)
        print("Number of tuples after pruning: " + str(goodTuples))
        print("================== ALL RELATIONS =================")
        myTuples = list(tuples)
        myTuples.sort(key=operator.itemgetter(1), reverse=True)
        count = 0
        queries.add(query)
        for tup in myTuples:
            if (count == 0 and tup[5] + " " + tup[3] not in queries):
                query = tup[5] + " " + tup[3]
                count = 1
            print("RelationType: " + tup[0] + "  | Confidence: " +
                  str(tup[1]) + "		| Entity #1= " + tup[3] + " (" + tup[2] +
                  ")	| Entity #2: " + tup[5] + " (" + tup[4] + ")")
        if (count == 0):
            print(
                "All possible queries have already been used! Breaking Program"
            )
            goodTuples = 1000
        iterationNum += 1
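
A sketch of calling makeQuery directly (all argument values below are placeholders):

makeQuery("<SEARCH_JSON_API_KEY>", "<SEARCH_ENGINE_ID>",
          relation=4, threshold=0.35, query="bill gates microsoft", k=10)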
Example 10
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
from NLPCore import NLPCoreClient
from collections import defaultdict
import urllib.request as ur
import tika
import json
import sys
import re


SearchAPI = "https://www.googleapis.com/customsearch/v1"
NLPPackagePath = sys.argv[7]
client = NLPCoreClient(NLPPackagePath)
## Parameters for selection
DEBUG = False
USE_TIKA = True   # Default is BeautifulSoup; Tika can be chosen instead

checkList = {"Work_For":"ORGANIZATION PEOPLE", "OrgBased_In": "LOCATION ORGANIZATION", "Live_In":"LOCATION PEOPLE","Located_In":"LOCATION LOCATION"}

def googleQuery(CSEKey, JsonAPIKey, query):
    payload = {'cx': CSEKey , 'key': JsonAPIKey ,'q' : query}
    r = requests.get(SearchAPI, params=payload)
    return json.loads(r.text)
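
# Example call (a sketch; keys and query are placeholders):
#   resp = googleQuery("<SEARCH_ENGINE_ID>", "<SEARCH_JSON_API_KEY>", "bill gates microsoft")
#   for item in resp.get("items", []):
#       print(item["link"])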

def print_formatter(source):
    print("=============== EXTRACTED RELATION ===============")
    print("Sentence:",source['text'])
    builder = []