def entities_text(text):
    """Detects entities in the text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects entities in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities

    # entity types from enums.Entity.Type
    entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
                   'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

    for entity in entities:
        print('=' * 20)
        print(u'{:<16}: {}'.format('name', entity.name))
        print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
        print(u'{:<16}: {}'.format('metadata', entity.metadata))
        print(u'{:<16}: {}'.format('salience', entity.salience))
        print(u'{:<16}: {}'.format(
            'wikipedia_url', entity.metadata.get('wikipedia_url', '-')))
def entity_sentiment_text(text):
    """Detects entity sentiment in the provided text."""
    # [START beta_client]
    client = language_v1beta2.LanguageServiceClient()
    # [END beta_client]

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(content=text.encode('utf-8'),
                              type=enums.Document.Type.PLAIN_TEXT)

    # Pass in encoding type to get useful offsets in the response.
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16

    result = client.analyze_entity_sentiment(document, encoding)

    for entity in result.entities:
        print('Mentions: ')
        print(u'Name: "{}"'.format(entity.name))
        for mention in entity.mentions:
            print(u'  Begin Offset : {}'.format(mention.text.begin_offset))
            print(u'  Content : {}'.format(mention.text.content))
            print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
            print(u'  Sentiment : {}'.format(mention.sentiment.score))
            print(u'  Type : {}'.format(mention.type))
        print(u'Salience: {}'.format(entity.salience))
        print(u'Sentiment: {}\n'.format(entity.sentiment))
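# A minimal helper sketch (an assumption, mirroring the encoding logic above,
# not part of the original snippets): pick the EncodingType that matches this
# Python build, so the offsets in the API response line up with Python string
# indices.
def native_encoding_type():
    if sys.maxunicode == 65535:
        # Narrow build: strings are indexed by UTF-16 code units.
        return enums.EncodingType.UTF16
    # Wide build: one index per Unicode code point.
    return enums.EncodingType.UTF32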
def getTextTopic(searchString):
    """Classifies content categories of the provided text."""
    try:
        client = language_v1beta2.LanguageServiceClient()

        document = types.Document(content=searchString,
                                  type=enums.Document.Type.PLAIN_TEXT)

        categories = client.classify_text(document).categories
        print(categories)

        if not categories:
            return []

        # Return only the top-ranked category.
        category = {
            'name': categories[0].name,
            'confidence': categories[0].confidence
        }
        return category
    except ValueError:
        return ''
def entity_sentiment_file(gcs_uri):
    """Detects entity sentiment in a Google Cloud Storage file."""
    client = language_v1beta2.LanguageServiceClient()

    document = types.Document(
        gcs_content_uri=gcs_uri,
        type=enums.Document.Type.PLAIN_TEXT)

    # Pass in encoding type to get useful offsets in the response.
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16

    result = client.analyze_entity_sentiment(document, encoding)

    for entity in result.entities:
        print(u'Name: "{}"'.format(entity.name))
        for mention in entity.mentions:
            print(u'  Begin Offset : {}'.format(mention.text.begin_offset))
            print(u'  Content : {}'.format(mention.text.content))
            print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
            print(u'  Sentiment : {}'.format(mention.sentiment.score))
            print(u'  Type : {}'.format(mention.type))
        print(u'Salience: {}'.format(entity.salience))
        print(u'Sentiment: {}\n'.format(entity.sentiment))
def doEntitiyAnalysis(searchString):
    """Detects entities in the text."""
    try:
        client = language_v1beta2.LanguageServiceClient()

        text = searchString
        if isinstance(text, six.binary_type):
            text = text.decode('utf-8')

        # Instantiates a plain text document.
        document = types.Document(content=text,
                                  type=enums.Document.Type.PLAIN_TEXT)

        # Detects entities in the document. You can also analyze HTML with:
        #   document.type == enums.Document.Type.HTML
        entities = client.analyze_entities(document).entities

        # entity types from enums.Entity.Type
        entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
                       'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

        for entity in entities:
            print('=' * 20)
            print(u'{:<16}: {}'.format('name', entity.name))
            print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
            print(u'{:<16}: {}'.format('metadata', entity.metadata))
            print(u'{:<16}: {}'.format('salience', entity.salience))
            print(u'{:<16}: {}'.format(
                'wikipedia_url', entity.metadata.get('wikipedia_url', '-')))
    except ValueError:
        return ''
def classify(text, verbose=True):
    """Classify the input text into categories."""
    language_client = language_v1beta2.LanguageServiceClient()

    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)
    response = language_client.classify_text(document)
    categories = response.categories

    result = {}
    for category in categories:
        # Turn the categories into a dictionary of the form:
        # {category.name: category.confidence}, so that they can
        # be treated as a sparse vector.
        result[category.name] = category.confidence

    if verbose:
        print(text)
        for category in categories:
            print(u'=' * 20)
            print(u'{:<16}: {}'.format('category', category.name))
            print(u'{:<16}: {}'.format('confidence', category.confidence))

    return result
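# A short sketch (not in the original) of what the sparse-vector comment in
# classify() enables: two classification dicts can be compared with a plain
# dot product over the categories they share.
def classification_similarity(result_a, result_b):
    """Dot product of two {category: confidence} dicts from classify()."""
    return sum(confidence * result_b[name]
               for name, confidence in result_a.items()
               if name in result_b)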
def getMostRelevantEntity(searchString):
    """Detects entities in the text and returns the first PERSON entity."""
    try:
        client = language_v1beta2.LanguageServiceClient()

        text = searchString
        if isinstance(text, six.binary_type):
            text = text.decode('utf-8')

        # Instantiates a plain text document.
        document = types.Document(content=text,
                                  type=enums.Document.Type.PLAIN_TEXT)

        # Detects entities in the document. You can also analyze HTML with:
        #   document.type == enums.Document.Type.HTML
        entities = client.analyze_entities(document).entities

        # entity types from enums.Entity.Type
        entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
                       'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

        return_entity = None
        for entity in entities:
            if entity_type[entity.type] == 'PERSON':
                return_entity = entity
                break
        if return_entity is None:
            return ''

        result = {
            'name': return_entity.name,
            'salience': return_entity.salience,
            'wikipedia_url': return_entity.metadata.get('wikipedia_url', '-')
        }
        return result
    except ValueError:
        return ''
def syntax_file(gcs_uri):
    """Detects syntax in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens

    return tokens
def classify_file(gcs_uri):
    """Classifies the text in a Google Cloud Storage file."""
    client = language_v1beta2.LanguageServiceClient()

    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    categories = client.classify_text(document).categories

    for category in categories:
        print(u'=' * 20)
        print(u'{:<16}: {}'.format('name', category.name))
        print(u'{:<16}: {}'.format('confidence', category.confidence))
def parse_text(text):
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(content=text.encode('utf-8'),
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detect and send native Python encoding to receive correct word offsets.
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16

    result = client.analyze_entity_sentiment(document, encoding)

    keywords = []
    categories = []
    # Keep entities that have at least one positively scored mention.
    for entity in result.entities:
        for mention in entity.mentions:
            if mention.sentiment.score > 0 and entity.name.lower() not in keywords:
                keywords.append(entity.name.lower())

    # Classify each "SEC." section of the text; subsections shorter than 750
    # characters are merged into the next subsection before classification.
    sections = text.strip().split("SEC.")
    language_client = language_v1beta2.LanguageServiceClient()
    for section in sections:
        subsections = section.strip().split(" (")
        for i in range(0, len(subsections)):
            subsection = subsections[i]
            if len(subsection) > 750:
                document = types2.Document(
                    content=subsection.encode('utf-8'),
                    type=enums2.Document.Type.PLAIN_TEXT)
                result = language_client.classify_text(document)
                for category in result.categories:
                    categories.append(category.name)
            else:
                if i < len(subsections) - 1:
                    subsections[i + 1] = subsections[i] + " " + subsections[i + 1]

    return keywords, categories
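# Hypothetical driver for parse_text(); the argument name and the output
# formatting are assumptions, not part of the original source.
def print_bill_summary(bill_text):
    keywords, categories = parse_text(bill_text)
    print(u'keywords  : {}'.format(', '.join(keywords)))
    print(u'categories: {}'.format(', '.join(categories)))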
def get_topic(article):
    language_client = language_v1beta2.LanguageServiceClient()

    document = types_topic.Document(content=f"{article['cleaned_text']}",
                                    type=enums_topic.Document.Type.PLAIN_TEXT)
    result = language_client.classify_text(document)

    highest_confidence = []
    for category in result.categories:
        highest_confidence.append({
            'category': category.name,
            'confidence': category.confidence
        })

    # Guard against texts the API returns no categories for, which would
    # otherwise make max() raise a ValueError.
    if not highest_confidence:
        return None

    highest = max(highest_confidence, key=lambda x: x['confidence'])
    return filter_topic(highest['category'])
def classify(text):
    language_client = language_v1beta2.LanguageServiceClient()

    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)
    result = language_client.classify_text(document)

    # Return the confidence of the first category whose name contains
    # "/News", or None if the text is not classified as news.
    newsConfidence = None
    for category in result.categories:
        if "/News" in category.name:
            newsConfidence = category.confidence
            break

    return newsConfidence
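# Hedged usage sketch for classify() above: treat a text as news once the
# "/News" confidence clears a threshold. The 0.5 cutoff is an assumption,
# not a value from the original code.
def is_news(text, threshold=0.5):
    confidence = classify(text)
    return confidence is not None and confidence >= threshold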
def sentiment_file(gcs_uri):
    """Detects sentiment in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects sentiment in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    sentiment = client.analyze_sentiment(document).document_sentiment

    print('Score: {}'.format(sentiment.score))
    print('Magnitude: {}'.format(sentiment.magnitude))
def syntax_text(text):
    """Detects syntax in the text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens

    return tokens
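# A small sketch (an assumption, not in the original) of consuming the tokens
# syntax_text() returns: tally the part-of-speech tag values with a Counter.
import collections

def pos_histogram(text):
    tokens = syntax_text(text)
    return collections.Counter(token.part_of_speech.tag for token in tokens)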
def classify_text(text):
    """Classifies content categories of the provided text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(content=text.encode('utf-8'),
                              type=enums.Document.Type.PLAIN_TEXT)

    categories = client.classify_text(document).categories

    for category in categories:
        print(u'=' * 20)
        print(u'{:<16}: {}'.format('name', category.name))
        print(u'{:<16}: {}'.format('confidence', category.confidence))
def sentiment_text(text):
    """Detects sentiment in the text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects sentiment in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    sentiment = client.analyze_sentiment(document).document_sentiment

    print('Score: {}'.format(sentiment.score))
    print('Magnitude: {}'.format(sentiment.magnitude))
def syntax_file(gcs_uri):
    """Detects syntax in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens

    # part-of-speech tags from enums.PartOfSpeech.Tag
    pos_tag = ('UNKNOWN', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
               'PRON', 'PRT', 'PUNCT', 'VERB', 'X', 'AFFIX')

    for token in tokens:
        print(u'{}: {}'.format(pos_tag[token.part_of_speech.tag],
                               token.text.content))
def run_quickstart():
    # [START language_quickstart]
    # Imports the Google Cloud client library
    from google.cloud import language_v1beta2
    from google.cloud.language_v1beta2 import enums
    from google.cloud.language_v1beta2 import types

    # Instantiates a client with the v1beta2 version
    client = language_v1beta2.LanguageServiceClient()

    # The text to analyze
    text = u'Hallo Welt!'
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT,
                              language='de')

    # Detects the sentiment of the text
    sentiment = client.analyze_sentiment(document).document_sentiment

    print('Text: {}'.format(text))
    print('Sentiment: {}, {}'.format(sentiment.score, sentiment.magnitude))
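# The quickstart is normally run as a script; a minimal entry point:
if __name__ == '__main__':
    run_quickstart()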
def doSentimentAnalysis(searchString):
    try:
        askstr = searchString.encode('utf-8')
        print(askstr)

        document = types.Document(content=askstr,
                                  type=enums.Document.Type.PLAIN_TEXT)

        # Instantiates a client
        client = language_v1beta2.LanguageServiceClient()

        sentiment = client.analyze_sentiment(
            document=document).document_sentiment

        print(str(sentiment.score))
        return (str(sentiment.score), str(sentiment.magnitude))
    except ValueError:
        return ''
def entities_file(gcs_uri):
    """Detects entities in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects entities in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities

    for entity in entities:
        print('=' * 20)
        print(u'{:<16}: {}'.format('name', entity.name))
        print(u'{:<16}: {}'.format('type', entity.type))
        print(u'{:<16}: {}'.format('metadata', entity.metadata))
        print(u'{:<16}: {}'.format('salience', entity.salience))
        print(u'{:<16}: {}'.format(
            'wikipedia_url', entity.metadata.get('wikipedia_url', '-')))
def syntax_text(text):
    """Detects syntax in the text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens

    # part-of-speech tags from enums.PartOfSpeech.Tag
    pos_tag = ('UNKNOWN', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
               'PRON', 'PRT', 'PUNCT', 'VERB', 'X', 'AFFIX')

    for token in tokens:
        print(u'{}: {}'.format(pos_tag[token.part_of_speech.tag],
                               token.text.content))
def entities_file(gcs_uri):
    """Detects entities in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects entities in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities

    # entity types from enums.Entity.Type
    entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
                   'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

    for entity in entities:
        print('=' * 20)
        print(u'{:<16}: {}'.format('name', entity.name))
        print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
        print(u'{:<16}: {}'.format('metadata', entity.metadata))
        print(u'{:<16}: {}'.format('salience', entity.salience))
        print(u'{:<16}: {}'.format(
            'wikipedia_url', entity.metadata.get('wikipedia_url', '-')))
def get_bill(id, session):
    # Fetch the bill metadata from the ProPublica Congress API.
    url = 'https://api.propublica.org/congress/v1/%s/bills/%s.json' % (session, id)
    headers = {'X-API-Key': 'gt6jsrJY8cXmh6WmRYwK0820BFfrtZlf25fJSKlo'}
    req = urllib.request.Request(url, None, headers)
    response = urllib.request.urlopen(req).read()
    billinfo = json.loads(response)['results'][0]

    chamber = ""
    if billinfo['bill_type'][0] == 'h':
        chamber = 'house'
    elif billinfo['bill_type'][0] == 's':
        chamber = 'senate'

    sponsor_funding_list = get_congressman(billinfo['sponsor'], chamber)

    # Fetch the cosponsors and their funding lists.
    cosponsors_funding_lists = {}
    cosponsor_url = 'https://api.propublica.org/congress/v1/%s/bills/%s/cosponsors.json' % (
        session, id)
    cosponsor_headers = {
        'X-API-Key': 'gt6jsrJY8cXmh6WmRYwK0820BFfrtZlf25fJSKlo'
    }
    cosponsor_req = urllib.request.Request(cosponsor_url, None,
                                           cosponsor_headers)
    cosponsor_response = urllib.request.urlopen(cosponsor_req).read()
    cosponsor_list = json.loads(cosponsor_response)['results'][0]['cosponsors']
    for cosponsor in cosponsor_list:
        cosponsors_funding_lists[cosponsor['name']] = get_congressman(
            cosponsor["name"], chamber)

    funding_list = dict(cosponsors_funding_lists)
    funding_list[billinfo['sponsor']] = sponsor_funding_list

    # Download the full bill text from GPO.
    if chamber == 'house':
        bill_url = 'https://www.gpo.gov/fdsys/pkg/BILLS-' + str(session)\
            + str(id) + 'ih/html/BILLS-' + str(session) + str(id) + 'ih.htm'
    elif chamber == 'senate':
        bill_url = 'https://www.gpo.gov/fdsys/pkg/BILLS-' + str(session) \
            + str(id) + 'is/html/BILLS-' + str(session) + str(id) + 'is.htm'
    bill_headers = {'User-Agent': 'Mozilla/5.0'}
    bill_req = urllib.request.Request(bill_url, None, bill_headers)
    bill_response = urllib.request.urlopen(bill_req).read().decode("utf-8")

    keywords, categories = parse_text(html2text.html2text(bill_response))

    # Expand the keywords with their WordNet synonyms.
    words_to_check = []
    for word in keywords:
        try:
            if include(word) and ' ' not in word:
                syn = nltk.corpus.wordnet.synsets(word)
                words_to_check = words_to_check + syn
            else:
                lst = word.strip().split(' ')
                for w in lst:
                    if include(word):
                        syn = nltk.corpus.wordnet.synsets(w)
                        words_to_check = words_to_check + syn
        except nltk.corpus.reader.wordnet.WordNetError:
            pass

    syn_words = []
    for synword in words_to_check:
        word = synword.name().strip().split(".")[0].replace("_", " ")
        if include(word):
            syn_words.append(word)

    words_to_check = set([word for word in keywords if include(word)] +
                         syn_words)

    # For each sponsor, keep the donor companies whose Wikipedia pages either
    # mention one of the keywords or classify into one of the bill's categories.
    relevant_list = {}
    for sponsor in funding_list:
        sponsor_relevant_list = {}
        if funding_list[sponsor] is not None and funding_list[sponsor][0] is not None:
            for company in funding_list[sponsor][0]:
                try:
                    wikipage = wptools.page(company.replace(" ", "_"))
                    pagedata = wikipage.get_query().data['extext']
                    for word in words_to_check:
                        if word in pagedata:
                            sponsor_relevant_list[company] = funding_list[
                                sponsor][0][company]
                    language_client = language_v1beta2.LanguageServiceClient()
                    document = types2.Document(
                        content=pagedata,
                        type=enums2.Document.Type.PLAIN_TEXT)
                    result = language_client.classify_text(document)
                    for category in result.categories:
                        flag = False
                        for bill_category in categories:
                            if category.name in bill_category or bill_category in category.name:
                                flag = True
                        if flag:
                            sponsor_relevant_list[company] = funding_list[
                                sponsor][0][company]
                except LookupError:
                    pass
        relevant_list[sponsor] = sponsor_relevant_list

    print(json.dumps(relevant_list, indent=4, separators=(',', ': ')))
    return relevant_list
def getTag(content_input):
    """Classifies the content and returns the raw classify_text response."""
    # Relies on a module-level language_client, types, and enums, as in the
    # surrounding snippets.
    document = types.Document(content=content_input,
                              type=enums.Document.Type.PLAIN_TEXT)
    result = language_client.classify_text(document)
    return result
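# Hypothetical caller for getTag() (not in the original), printing the
# categories the way the other snippets here do.
def print_tags(content_input):
    result = getTag(content_input)
    for category in result.categories:
        print(u'{:<16}: {}'.format(category.name, category.confidence))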
from google.cloud import language_v1beta2
from google.cloud.language_v1beta2 import enums
from google.cloud.language_v1beta2 import types

language_client = language_v1beta2.LanguageServiceClient()

document = types.Document(content='''
    Rafael Montero Shines in Mets Victory Over the Reds.
    Montero, who was demoted at midseason, took a one-hitter into the ninth
    inning as the Mets continued to dominate Cincinnati with a win at Great
    American Ball Park.''',
    type=enums.Document.Type.PLAIN_TEXT)

result = language_client.classify_text(document)

for category in result.categories:
    print('category name: ', category.name)
    print('category confidence: ', category.confidence, '\n')