# `Ner` comes from the sner package (a thin client for the Stanford NER server)
from sner import Ner


def getPersonNames(string):
    """Collect person names from text using two Stanford NER servers
    (one model on port 9199, a second model on port 9198)."""
    s = string.replace("\n", " . ")
    taggedEng = Ner(host='localhost', port=9199).get_entities(s)
    taggedInd = Ner(host='localhost', port=9198).get_entities(s)

    namesList = []
    # Merge runs of consecutive PERSON tokens into full names, for both tag streams.
    for tagged in (taggedEng, taggedInd):
        name = None
        for word in tagged:
            if word[1] == "PERSON":
                name = str(word[0]) if name is None else name + " " + str(word[0])
            elif name is not None:
                namesList.append(name.lower().replace("\n", ""))
                name = None
        if name is not None:
            namesList.append(name.lower().replace("\n", ""))

    # Drop any name that is a substring of another collected name.
    n = len(namesList)
    i = 0
    while i < n:
        name = namesList[i]
        j = 0
        while j < n:
            if name in namesList[j] and i != j:
                namesList.pop(i)
                n -= 1
                i -= 1
                break
            j += 1
        i += 1

    return namesList
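# Hedged usage sketch for getPersonNames: assumes the sner package is installed
# and that Stanford NER servers are already running on localhost ports 9199 and
# 9198; the sample text and the expected output are illustrative only.
sample = "Barack Obama met Angela Merkel.\nThey spoke briefly in Berlin."
print(getPersonNames(sample))
# expected to print something along the lines of: ['barack obama', 'angela merkel']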
def extract_unknown_ner(sentences_df,
                        TEXT_COL='sentences',
                        NER_COL='named_entities',
                        ner_port=9199):
    ''' Extract named entities using Stanford's NER.
        Requires a Java NER server to already be running.

        sentences_df: pandas DataFrame with one column of non-lowercased sentences
        TEXT_COL: name of the column with sentences
        NER_COL: str name of the output column
    '''
    # To run this, you need to set up an SNER local server:
    # download Stanford NER (a zip file of the form stanford-ner-YYYY-MM-DD,
    # e.g. from https://nlp.stanford.edu/software/CRF-NER.shtml#Download),
    # then start the Java server:
    # cd C:\ProgramData\Anaconda3\Lib\site-packages\sner\stanford-ner-2018-02-27
    # java -Djava.ext.dirs=./lib -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -port 9199 -loadClassifier ./classifiers/english.all.3class.distsim.crf.ser.gz  -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions tokenizeNLs=false

    # filter to sentences long enough to contain sentiment and a player name
    min_length = 10  # characters
    sentences_df = sentences_df[sentences_df[TEXT_COL].str.len() >= min_length].copy()

    # tag using the Java server
    pos_tagger = Ner(host='localhost', port=ner_port)

    # would love to parallelize this, as it takes ~2-3 hours per year of data
    # ddf = dd.from_pandas(sentences_df)
    def sner_entities(text):
        return [token for token, part in pos_tagger.get_entities(text)
                if part in {'PERSON', 'ORGANIZATION', 'LOCATION'}]

    sentences_df[NER_COL] = sentences_df[TEXT_COL].apply(sner_entities)

    return sentences_df
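# Hedged usage sketch for extract_unknown_ner: assumes pandas is installed, an
# English NER server is listening on port 9199, and the DataFrame contents below
# are illustrative only.
import pandas as pd

sentences = pd.DataFrame({
    'sentences': ["Lionel Messi joined Inter Miami last summer.",
                  "Short."]   # under the min_length of 10 characters, so filtered out
})
tagged = extract_unknown_ner(sentences, TEXT_COL='sentences', NER_COL='named_entities')
print(tagged[['sentences', 'named_entities']])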
class StanfordNERClient(NER):
    """
    To run the server see:
        pythonmodules/ner/run_stanford.sh
    """
    def __init__(self, host=None, port=None):
        self.host = 'localhost' if host is None else host
        self.port = 9001 if port is None else port
        self.server = None

    def connect(self):
        self.server = Ner(host=self.host, port=self.port)

    def tag(self, text, **kwargs):
        # strip the Unicode replacement character (U+FFFD) so the NER server
        # does not warn about "untokenizable" characters
        text = bytes(text, 'utf-8').decode('utf-8', 'ignore')
        text = text.replace('\ufffd', '')

        text = str(text).splitlines()
        if self.server is None:
            self.connect()
        try:
            return self._run(text)
        except ConnectionResetError:
            self.connect()
        except ConnectionRefusedError:
            raise ConnectionRefusedError(
                "Connection refused, is the server running at %s:%d? Check run_stanford.sh..."
                % (self.host, self.port))
        return self._run(text)

    def _run(self, text):
        return list(
            itertools.chain(*[self.server.get_entities(line)
                              for line in text]))
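# Hedged usage sketch for StanfordNERClient: assumes a Stanford NER server was
# started via pythonmodules/ner/run_stanford.sh and listens on the default
# localhost:9001; the sample text is illustrative only.
client = StanfordNERClient()
tags = client.tag("Angela Merkel visited Paris.\nShe met Emmanuel Macron.")
# tag() splits the text into lines and returns one flat list of (token, label)
# pairs, e.g. [('Angela', 'PERSON'), ('Merkel', 'PERSON'), ('visited', 'O'), ...]
print(tags)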
Example #4
class STF_NER:
    def __init__(self):
        # Stanford NER server expected on localhost:8046
        self.tagger = Ner(host='localhost', port=8046)

    def get_ner(self, text):
        return self.tagger.get_entities(text)
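# Minimal usage sketch for STF_NER: assumes a Stanford NER server is listening
# on localhost:8046; the sample sentence and output are illustrative only.
ner = STF_NER()
print(ner.get_ner("Tim Cook runs Apple in Cupertino."))
# e.g. [('Tim', 'PERSON'), ('Cook', 'PERSON'), ('runs', 'O'), ('Apple', 'ORGANIZATION'), ...]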
Example #5
def use_stanford_ner(data):
    entities = []
    helper = Helper
    tagger = Ner(host='localhost', port=9199)

    for row in data:
        sentences = helper.get_sentences(row['supposed_string'])
        ne_sentences = [
            tagger.get_entities(sentence) for sentence in sentences
        ]
        tagged_sentences = [
            helper.transform_stanford_name_entity_to_tree(sentence)
            for sentence in ne_sentences
        ]
        grammar = helper.get_grammar('stanford_ner')
        parsed_sentences = helper.get_parsed_sentences(grammar,
                                                       tagged_sentences)
        entities.extend(helper.extract_entities(parsed_sentences, row))
    return entities
def com_ner(data_type: str, rp: str):
    """
    Gets the NER tags of each sentence using the sequential model pre-trained by the Stanford NLP programs.

    :param data_type: String, either `training` or `test`
    :param rp: Absolute path of the root directory of the project
    :return:
        boolean_flag: True for a successful operation, False otherwise.
        all_ners: List of NER tags for each line
    """
    # initialize the tagger pointing at the Stanford NER server
    tagger = Ner(host="localhost", port=9199)
    all_ners = []
    read_flag, file = read_file("raw_sentence_{0}".format(data_type), rp)
    if read_flag:
        for line in file:
            word_tags = tagger.get_entities(line)
            ner_tags = [x[1] for x in word_tags]
            all_ners.append(" ".join(ner_tags))
        return True, all_ners
    # return a (flag, list) pair on failure as well, so callers can always unpack two values
    return False, all_ners
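# Hedged usage sketch for com_ner: assumes the project's read_file() helper can
# locate "raw_sentence_training" under the (hypothetical) project root below and
# that a Stanford NER server is listening on localhost:9199.
ok, ner_tags = com_ner("training", "/path/to/project/root")
if ok:
    print(ner_tags[0])   # e.g. "PERSON PERSON O O O LOCATION"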
Example #7
# input: candidate sentences file (tab-separated); path left as a placeholder
cand_file = " "
# output file path, also left as a placeholder
outputfile = " "
k = 0

with open(cand_file, "r") as input, open(outputfile, "w") as output:
    reader = csv.reader(input, delimiter="\t")
    writer = csv.writer(output, delimiter="\t", quoting=csv.QUOTE_ALL)
    # connect to the Stanford NER server once, outside the row loop
    tagger = Ner(host="localhost", port=9199)
    for row in reader:
        articleid = row[0]
        source = row[1]
        sent = row[2]
        # tag the candidate sentence
        tag = tagger.get_entities(sent)
        # check whether all source words are tagged with "PERSON"
        for i in range(len(tag)):
            flag = False
            if tag[i][0] == first_pattern:
                source_person = []
                for j in range(1, min(6, len(tag) - i)):
                    if (tag[i + j][1] != "PERSON"
                            and tag[i + j][0] != second_pattern
                            and tag[i + j][0] != first_pattern):
                        break
                    elif tag[i + j][0] == first_pattern:
                        source_person = []
                    elif (tag[i + j][0] == second_pattern
                          and source_person != []
Example #8
def newsinfo(request):
    # nltk.download('vader_lexicon')
    context = {}
    form = StockForm(request.GET)
    form.is_valid()

    beta = 0.8
    text = form.cleaned_data['news_input']
    text = re.sub(r"[^\w\s]", '', text)

    st = Ner(host='localhost', port=9199)

    sid = SentimentIntensityAnalyzer()
    ss = sid.polarity_scores(text)
    sentimentResult = ", ".join('{0}: {1}'.format(k, ss[k]) for k in sorted(ss))
    context['sentimentresult'] = sentimentResult
    # first part - find proper nouns (NNP) with NLTK POS tagging
    start = time.time()
    tokens = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(tokens)
    entities = nltk.chunk.ne_chunk(tagged)
    wordcounts = Counter(tokens)
    organizationList = []
    for tag in tagged:
        if tag[1] == 'NNP' and tag[0] != '' and tag[0] not in organizationList:
            organizationList.append(tag[0])

    # second part - Stanford NER
    newOrganizationList = st.get_entities(text)
    for org in newOrganizationList:
        if org[1] == 'ORGANIZATION' and org[0] not in organizationList:
            organizationList.append(org[0])

    # third part - NLTK NER chunking
    organizationList = get_continuous_chunks(text, organizationList)

    for org in organizationList:
        print(org)
    # use a Google search to check whether each term is an organization/corporation
    newOrganizationList = []
    for org in organizationList:
        if checkOrganization(org):
            newOrganizationList.append(org)

    # print(newOrganizationList)
    tickerList = []
    filteredOrgList = []
    wordCountList = []
    # find each organization's ticker
    for orgName in newOrganizationList:
        ticker, tickerOrgName = findTicker(orgName)
        if ticker is not None and ticker not in tickerList:
            print(orgName)
            print(wordcounts[orgName])
            wordCountList.append(wordcounts[orgName])
            filteredOrgList.append(tickerOrgName)
            tickerList.append(ticker)
    probDict = {}
    totalWordCount = sum(wordCountList)
    for idx, org in enumerate(filteredOrgList):
        competitorName = []
        marketCap = []
        percentage = 1.0 / totalWordCount * wordCountList[idx]
        if org not in probDict:
            probDict[org] = percentage * beta
        else:
            probDict[org] += percentage * beta
        getCompetitorInfo(tickerList[idx], org, competitorName, marketCap)
        if competitorName != []:
            marketCap.pop(0)
            totalMarketCap = sum(marketCap)
            for i, competitor in enumerate(competitorName):
                if i < len(marketCap):
                    if competitor not in probDict:
                        probDict[competitor] = percentage * (1 - beta) * 1.0 / totalMarketCap * marketCap[i]
                    else:
                        probDict[competitor] += percentage * (1 - beta) * 1.0 / totalMarketCap * marketCap[i]

    print(probDict)
    labels = []
    values = []
    for key, value in probDict.items():   # dict.iteritems() only exists in Python 2
        labels.append(key)
        values.append(value)
    if len(labels) != 0:
        trace = graph_objs.Pie(labels=labels, values=values, textinfo='none')
        fig = Figure(data=Data([trace]))
        context['piechart'] = plot(fig, auto_open=False, output_type='div')

    top10String = ""
    sortList = numpy.argsort(values)[::-1]
    for idx, value in enumerate(sortList):
        if idx == 10:
            break
        top10String += labels[value] + ':' + str(values[value] * 100) + '%,'
    top10String = top10String[:-1]
    context['top10'] = top10String

    orgString = ",".join(filteredOrgList)
    tickerString = ",".join(tickerList)
    context['foundentities'] = orgString
    context['foundtickers'] = tickerString
    context['result'] = urllib2.unquote(text)   # urllib.parse.unquote in Python 3
    print(context['result'])
    print(context['foundentities'])
    print(context['foundtickers'])
    return render(request, 'news/newsinfo.html', context)
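# Hedged illustration of the beta weighting used in newsinfo (all numbers made up):
# an organization keeps beta of its word-count share of the article, and the
# remaining (1 - beta) is split among its competitors in proportion to market cap.
beta = 0.8
percentage = 0.6                              # share of recognized-organization word count
own_share = percentage * beta                 # roughly 0.48 kept by the organization itself
competitor_pool = percentage * (1 - beta)     # roughly 0.12 distributed to competitors
market_caps = [300.0, 100.0]                  # hypothetical competitor market caps
competitor_shares = [competitor_pool * cap / sum(market_caps) for cap in market_caps]
print(own_share, competitor_shares)           # about 0.48 [0.09, 0.03], up to float rounding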
Example #9
class landslide_location_extractor(utils.AssedMessageProcessor.AssedMessageProcessor):
    def __init__(self, debug=False):
        self.debug = debug
        self.time = time.time()
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        self.r = redis.Redis(connection_pool=pool)
        self.timecheck = 600
        self.locations = {}
        self.update_location_store()
        self.NER = Ner(host="localhost", port=9199)
        self.counter = 0
        self.memory = {}
        config = load_config("./config/assed_config.json")
        self.APIKEY = config["APIKEYS"]["googlemaps"]
        self.stream_tracker = {}

    def process(self,message):
        if message["streamtype"] not in self.stream_tracker:
            self.stream_tracker[message["streamtype"]] = {}
            self.stream_tracker[message["streamtype"]]["bad_location"] = 0
            self.stream_tracker[message["streamtype"]]["good_location"] = 0
            self.stream_tracker[message["streamtype"]]["totalcounter"] = 0
        if time.time() - self.time > self.timecheck:
            utils.helper_utils.std_flush("[%s] -- Updating news location store."%utils.helper_utils.readable_time())
            self.update_location_store()
            self.time = time.time()
            for _streamtype in self.stream_tracker:
                utils.helper_utils.std_flush("[%s] -- Processed %i elements from %s with %i good locations and %i bad locations"%(utils.helper_utils.readable_time(), self.stream_tracker[_streamtype]["totalcounter"],_streamtype, self.stream_tracker[_streamtype]["good_location"], self.stream_tracker[_streamtype]["bad_location"]))
                self.stream_tracker[_streamtype]["totalcounter"] = 0
                self.stream_tracker[_streamtype]["good_location"] = 0
                self.stream_tracker[_streamtype]["bad_location"] = 0
        if self.debug:
            _streamtype = message["streamtype"]
            utils.helper_utils.std_flush("Processed %i elements from %s with %i good locations and %i bad locations" % (self.stream_tracker[_streamtype]["totalcounter"], _streamtype, self.stream_tracker[_streamtype]["good_location"], self.stream_tracker[_streamtype]["bad_location"]))

        self.stream_tracker[message["streamtype"]]["totalcounter"] += 1
        # Check if location exists
        latitude = None
        longitude = None
        if "location" in message and message["location"] is not None and len(message["location"]) > 0:
            #already have a location
            pass
        else:
            # First pass: run NER tagging over the message text to extract candidate locations
            # str(bytes)[2:-1] strips the b'...' wrapper added by str() without dropping the final character
            cleaned_message = str(message["text"].encode("utf-8"))[2:-1]
            cleaned_message = " ".join(nltk.tokenize.word_tokenize(cleaned_message))
            loc_tags = self.NER.get_entities(cleaned_message)
            desc_locations = self.extractLocations(loc_tags)
            locations = " ".join(desc_locations) if len(desc_locations) > 0 else None

            if locations is None:
                # Attempt match...
                for sublocations in self.locations:
                    if sublocations in cleaned_message:
                        locations = sublocations
                        latitude = self.locations[sublocations][0]
                        longitude = self.locations[sublocations][1]
                        break
            else:
                # This is number of location items...
                pass

                #utils.helper_utils.std_flush(self.counter)
                        
            if locations is None:
                self.stream_tracker[message["streamtype"]]["bad_location"] += 1
                return (False, message)

            # location is there, we will attempt geocoding right here... right now... right on this ship
            # With sublocations...
            if latitude is None or longitude is None:
                standardized_location = utils.helper_utils.location_standardize(locations)

                for sublocation in standardized_location.split(":"):
                    if sublocation in self.locations:
                        latitude = self.locations[sublocation][0]
                        longitude = self.locations[sublocation][1]
            
        
            message["location"] = locations
        
        # check if coords already in message
        if message["latitude"] is not None and message["longitude"] is not None:
            pass
        else:
            if latitude is not None and longitude is not None:
                message["latitude"] = str(latitude)
                message["longitude"] = str(longitude)
            else:
                # Attempt to get location from extractor memory (assed:extractor...)
                
                # First normalize...
                extractor_locations = utils.helper_utils.location_standardize(message["location"])
                # Then attempt retrieve
                coordinates = None
                for extractor_sublocation in extractor_locations.split(":"):
                    r_key = utils.helper_utils.extractor_sublocation_key(extractor_sublocation)
                    coordinates = self.r.get(r_key)
                    if coordinates is not None:
                        latlng = coordinates.decode("utf-8").split(",")
                        latitude = float(latlng[0])
                        longitude = float(latlng[1])
                        break
                
                if coordinates is None:
                    # no sublocation exists. We are gonna have to geocode
                    utils.helper_utils.std_flush("[%s] -- Performing geolocation for %s using googlemaps"%(utils.helper_utils.readable_time(), message["location"]))
                    latitude = False
                    # compare with `is False` so a legitimate latitude of 0.0 is not treated as failure
                    while latitude is False:
                        latitude, longitude = utils.helper_utils.lookup_address_only(message["location"], self.APIKEY, self.r)
                        if latitude is False:
                            warnings.warn("[%s] -- WARNING -- Maps API expired at %s. Trying again after 2 hours." % (utils.helper_utils.readable_time(), time.time()))
                            time.sleep(7200)
                    if latitude is not None and longitude is not None:
                        coordinates = str(latitude) + "," + str(longitude)
                        for extractor_sublocation in extractor_locations.split(":"):
                            r_key = utils.helper_utils.extractor_sublocation_key(extractor_sublocation)
                            # TODO ADD TO MEMORY AS WELL
                            self.r.set(r_key, coordinates, ex=259200)
                    
            if latitude is not None and longitude is not None:
                message["latitude"] = str(latitude)
                message["longitude"] = str(longitude)
            else:
                self.stream_tracker[message["streamtype"]]["bad_location"] += 1
                return (False, message)
        self.stream_tracker[message["streamtype"]]["good_location"] += 1
        return (True, message)
        


    def update_location_store(self,):
        self.locations = {}
        for _key in self.r.scan_iter(match="assed:sublocation:*", count=500):
            # strip the "assed:sublocation:" prefix to keep only the location name
            key_location = _key.decode("utf-8").split("assed:sublocation:")[1]
            if key_location.strip():
                key_coords = self.r.get(_key).decode("utf-8").split(",")
                latitude = float(key_coords[0])
                longitude = float(key_coords[1])
                self.locations[key_location] = (latitude, longitude)


    def extractLocations(self, temp_loc_tags):
        """Merge consecutive LOCATION tokens from the NER output into place names."""
        locations = []
        temp_loc = []
        # guard against an empty tag list before peeking at the first entry
        if temp_loc_tags and temp_loc_tags[0][1] == 'LOCATION':
            temp_loc.append(temp_loc_tags[0][0])
        for entry in temp_loc_tags[1:]:
            if entry[1] == 'LOCATION':
                temp_loc.append(entry[0])
            else:
                if temp_loc:
                    locations.append(' '.join(temp_loc))
                    temp_loc = []
        if temp_loc:
            locations.append(' '.join(temp_loc))
        return locations
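# Hedged illustration of extractLocations: consecutive LOCATION tokens are merged
# into a single place name, and any non-LOCATION tag ends the current run. The
# tag list below is made up; since extractLocations does not use `self`, it can
# be called on the class directly for a quick check without building the pipeline.
_example_tags = [('Flooding', 'O'), ('hit', 'O'),
                 ('New', 'LOCATION'), ('Orleans', 'LOCATION'),
                 ('and', 'O'), ('Baton', 'LOCATION'), ('Rouge', 'LOCATION')]
print(landslide_location_extractor.extractLocations(None, _example_tags))
# -> ['New Orleans', 'Baton Rouge']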
Example #10
class News(multiprocessing.Process):
    def __init__(self, assed_config, root_name, errorQueue, messageQueue,
                 **kwargs):
        multiprocessing.Process.__init__(self)
        # set up DB connections
        self.DB_CONN = get_db_connection(assed_config)
        self.client = NewsApiClient(api_key="f715251d799140f793e63a1aec194920")
        self.root_name = root_name
        self.errorQueue = errorQueue
        self.messageQueue = messageQueue
        # No cached list because we are getting new stuff every day...
        self.config = kwargs["config"]
        self.NER = Ner(host='localhost', port=9199)
        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        self.r = redis.Redis(connection_pool=pool)
        pass

    def run(self, ):

        try:
            for event_topic in self.config["topic_names"]:
                if not self.config["topic_names"][event_topic][
                        "high_confidence"]["valid"]:
                    continue
                self.messageQueue.put("News downloader - working on %s" %
                                      event_topic)
                event_topic_key = str(
                    self.config["topic_names"][event_topic]["index"])
                self.cached_list = self.getCachedList(event_topic_key)
                stopwords = self.config["topic_names"][event_topic][
                    "stopwords"]
                keyword_set = self.config["topic_names"][event_topic][
                    "high_confidence"]["keywords"]
                articles = []
                for keyword in keyword_set:
                    try:
                        response = self.client.get_everything(q=keyword,
                                                              page_size=100)
                        articles += response["articles"]
                    except Exception as e:
                        self.messageQueue.put(
                            "NewsAPI for %s-%s failed with error: %s" %
                            (event_topic, keyword, repr(e)))

                article_content, article_location = self.getArticleDetails(
                    articles, stopwords)

                self.insertNews(article_content, event_topic_key)
                self.updateRedisLocations(article_location)

            self.DB_CONN.close()
            self.messageQueue.put(
                "Completed News download successfully at %s." %
                readable_time())

        except Exception as e:
            traceback.print_exc()
            self.errorQueue.put((self.root_name, str(e)))

    def getArticleDetails(self, articles, stopwords):
        article_content = []
        article_location = []
        exist_skip, stop_skip, location_skip, coordinate_skip = 0, 0, 0, 0
        for article in articles:
            item = {}
            item["id"] = base64.b64encode(str.encode(article["url"])).decode()
            if item["id"] in self.cached_list:
                exist_skip += 1
                continue
            item["source"] = article["source"]["name"]
            item["url"] = article["url"]
            item["time"] = dateutil.parser.parse(
                article["publishedAt"]).replace(
                    tzinfo=tz.gettz('UTC')).astimezone(
                        tz=tz.gettz('EDT')).strftime("%Y-%m-%d %H:%M:%S")
            item["title"] = article["title"]
            item["text"] = article["description"]

            # We do an extremely basic lookup: skip the article if it contains one of the topic's stopwords.
            # Since this is already a landslide feed, Google (or whichever provider) has better classifiers; we exploit those and only apply a super simple keyword filter here.
            search_flag = False
            search_counter = 0
            rText = item["text"]
            if "content" in article and article["content"] is not None and len(
                    article["content"]) > 0:
                rText += article["content"]
            while not search_flag and search_counter < len(stopwords):
                if stopwords[search_counter] in rText:
                    search_flag = True
                search_counter += 1
            if search_flag:
                stop_skip += 1
                continue

            # Description based location
            temp_loc_tags = self.NER.get_entities(item["text"])
            desc_locations = self.extractLocations(temp_loc_tags)
            content_locations = []
            try:
                temp_loc_tags = self.NER.get_entities(" ".join(
                    nltk.tokenize.word_tokenize(article["content"])))
                content_locations = self.extractLocations(temp_loc_tags)
            except (TypeError, IndexError):
                # TypeError -- if content is empty in article. IndexError -- if content is not None, but still empty
                pass

            # create the location set - take the union of description and content locations, after normalization
            # (use a distinct comprehension variable so the article dict `item` is not shadowed)
            item["description_location"] = [
                location_normalize(loc) for loc in desc_locations
            ]
            item["content_location"] = [
                location_normalize(loc) for loc in content_locations
            ]

            final_locations = list(
                set(item["description_location"] + item["content_location"]))
            if len(final_locations) == 0:
                location_skip += 1
                continue
            item["locations"] = final_locations

            lat, lng = lookup_address_only(
                desc_locations, self.config["APIKEYS"]["googlemaps"], self.r)
            # lookup_address_only returns False when the daily quota is exhausted;
            # `is False` avoids treating a latitude of 0.0 as failure
            if lat is False:
                raise ValueError("Ran out of GoogleMaps daily keys")
            if lat is None or lng is None:
                coordinate_skip += 1
                continue
            item["latitude"] = lat
            item["longitude"] = lng
            item["cell"] = generate_cell(lat, lng)

            article_content.append(item)
            article_location.append({
                "name": final_locations,
                "lat": lat,
                "lng": lng
            })

        self.messageQueue.put(
            "Obtained News with: %i items and skipped \n\texisting %i items\n\tstopword %i items, \n\tmissing location %i items \n\tmissing coordinates %i items"
            % (len(article_content), exist_skip, stop_skip, location_skip,
               coordinate_skip))
        return article_content, article_location

    def extractLocations(self, temp_loc_tags):
        locations = []
        temp_loc = []
        if temp_loc_tags and temp_loc_tags[0][1] == 'LOCATION':  # guard against empty tag lists
            temp_loc.append(temp_loc_tags[0][0])
        for entry in temp_loc_tags[1:]:
            if entry[1] == 'LOCATION':
                temp_loc.append(entry[0])
            else:
                if temp_loc:
                    locations.append(' '.join(temp_loc))
                    temp_loc = []
        if temp_loc:
            locations.append(' '.join(temp_loc))
        return locations

    def convertDateFromTime(self, tm):
        '''
        Convert datetime to MySQL's datetime from time structure.
        '''
        return time.strftime("%Y-%m-%d %H:%M:%S", tm)

    def getCachedList(self, event_topic):
        cachedlist = set()
        cursor = self.DB_CONN.cursor()
        # parameterized query so the date and topic name are quoted/escaped correctly
        select = "SELECT item_id FROM HCS_News WHERE timestamp > %s AND topic_name = %s"
        cursor.execute(select, ((datetime.now() - timedelta(days=5)).strftime("%Y-%m-%d"),
                                event_topic))
        results = cursor.fetchall()
        cursor.close()
        for row in results:
            cachedlist.add(row[0])
        self.messageQueue.put("News cachedlist has %i items in last 5 days" %
                              (len(cachedlist)))
        return cachedlist

    def insertNews(self, article_items, event_topic_key):
        event_topic_key = int(event_topic_key)
        cursor = self.DB_CONN.cursor()
        for item in article_items:

            insert = ('INSERT INTO HCS_News '
                      '(item_id, link, cell, latitude, longitude, timestamp, '
                      'location, news_src, text, topic_name) '
                      'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
            params = (item['id'], item['url'], item['cell'],
                      item['latitude'], item['longitude'], item['time'],
                      ",".join(item["locations"]), item['source'], item['text'],
                      event_topic_key)
            try:
                cursor.execute(insert, params)
                self.DB_CONN.commit()
            except Exception as e:
                traceback.print_exc()
                self.messageQueue.put('Failed to insert %s with error %s' %
                                      (item["id"], repr(e)))
        cursor.close()

    def updateRedisLocations(self, article_location):
        # store each article's location and sublocations in Redis (self.r)

        totalLocations = len(article_location)
        sublocations = 0
        for location in article_location:
            converted_location = " ".join(location["name"])
            location_std = location_standardize(converted_location)
            location_key = high_confidence_streamer_key("news:location:" +
                                                        location_std)
            self.r.set(location_key, converted_location, ex=259200)

            point_str = str(location["lat"]) + "," + str(location["lng"])
            for sublocation in location_std.split(":"):
                sublocationkey = sublocation_key(sublocation)
                self.r.set(sublocationkey, point_str, ex=259200)
                sublocations += 1
        self.messageQueue.put(
            "Completed News with: %i locations and %i sublocations" %
            (totalLocations, sublocations))
Example #11
         isdigit())
     for s in directors
 ]
 directors = [s.title() for s in directors]
 # print(directors)
 cleaned = []
 peoples = []
 for element in directors:
     tags = st.get_entities(element)
     # print(tokens, tags)
     if all(tag[1] == 'PERSON' for tag in tags):
         name = ' '.join(tag[0] for tag