def getMostRelevantEntity(searchString):
    """Detects entities in the text and returns the first PERSON entity."""
    try:
        client = language_v1beta2.LanguageServiceClient()

        if isinstance(searchString, six.binary_type):
            text = searchString.decode('utf-8')
        else:
            text = searchString

        # Instantiates a plain text document.
        document = types.Document(content=text,
                                  type=enums.Document.Type.PLAIN_TEXT)

        # Detects entities in the document. You can also analyze HTML with:
        #   document.type == enums.Document.Type.HTML
        entities = client.analyze_entities(document).entities

        # entity types from enums.Entity.Type
        entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
                       'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

        return_entity = None
        for entity in entities:
            if entity_type[entity.type] == 'PERSON':
                return_entity = entity
                break

        if return_entity is None:
            return ''

        return {
            'name': return_entity.name,
            'salience': return_entity.salience,
            'wikipedia_url': return_entity.metadata.get('wikipedia_url', '-')
        }
    except ValueError:
        return ''
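# A sketch of the shared imports these snippets assume (the exact module
# layout depends on the google-cloud-language release in use; `types` and
# `enums` here are assumed aliases, not confirmed by the original files):
#
#   import os
#   import sys
#
#   import six
#   from google.cloud import language_v1beta2
#   from google.cloud.language_v1beta2 import enums, types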
def parse_file(resume):
    os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.path.join(
        os.path.dirname(__file__), 'Parsing-385521996355.json')
    credentials = ServiceAccountCredentials.from_json_keyfile_name(
        os.path.join(os.path.dirname(__file__), 'Parsing-385521996355.json'),
        scopes='https://www.googleapis.com/auth/cloud-language')

    client = language_v1beta2.LanguageServiceClient()
    document = types.Document(content=resume,
                              type=enums.Document.Type.PLAIN_TEXT)
    ent = client.analyze_entities(document=document).entities

    parsed = {}
    # Extract name, email, and phone number.
    parsed['person'] = personel_information(resume)
    # Extract undergraduate major/minor; graduate degrees need more work.
    parsed['education'] = extract_School(resume, ent)
    # Extract companies and work experience; still needs a lot of work.
    parsed['work'] = extract_company(resume, ent)
    # Extracts skills reliably, but takes time.
    parsed['skills'] = extract_all_skills(resume)
    return parsed
def classify(text, verbose=True):
    """Classify the input text into categories."""
    language_client = language_v1beta2.LanguageServiceClient()

    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)
    response = language_client.classify_text(document)
    categories = response.categories

    result = {}
    for category in categories:
        # Turn the categories into a dictionary of the form
        # {category.name: category.confidence}, so that they can
        # be treated as a sparse vector.
        result[category.name] = category.confidence

    if verbose:
        print(text)
        for category in categories:
            print(u'=' * 20)
            print(u'{:<16}: {}'.format('category', category.name))
            print(u'{:<16}: {}'.format('confidence', category.confidence))

    return result
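# A minimal usage sketch for classify() above (hypothetical helper, not
# part of the original code): since classify() returns a
# {category.name: confidence} dict, two results can be compared as sparse
# vectors with a simple dot product.
def category_overlap(text_a, text_b):
    """Dot product of the category vectors of two texts."""
    vec_a = classify(text_a, verbose=False)
    vec_b = classify(text_b, verbose=False)
    return sum(conf * vec_b.get(name, 0.0) for name, conf in vec_a.items())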
def test_analyze_sentiment(self):
    client = language_v1beta2.LanguageServiceClient()
    content = "Hello, world!"
    type_ = enums.Document.Type.PLAIN_TEXT
    document = {"content": content, "type": type_}
    response = client.analyze_sentiment(document)
def test_analyze_entities(self, mock_create_stub):
    # Mock gRPC layer
    grpc_stub = mock.Mock()
    mock_create_stub.return_value = grpc_stub

    client = language_v1beta2.LanguageServiceClient()

    # Mock request
    document = {}

    # Mock response
    language = 'language-1613589672'
    expected_response = {'language': language}
    expected_response = language_service_pb2.AnalyzeEntitiesResponse(
        **expected_response)
    grpc_stub.AnalyzeEntities.return_value = expected_response

    response = client.analyze_entities(document)
    self.assertEqual(expected_response, response)

    grpc_stub.AnalyzeEntities.assert_called_once()
    args, kwargs = grpc_stub.AnalyzeEntities.call_args
    self.assertEqual(len(args), 2)
    self.assertEqual(len(kwargs), 1)
    self.assertIn('metadata', kwargs)
    actual_request = args[0]

    expected_request = language_service_pb2.AnalyzeEntitiesRequest(
        document=document)
    self.assertEqual(expected_request, actual_request)
def entity_sentiment_text(text):
    """Detects entity sentiment in the provided text."""
    # [START beta_client]
    client = language_v1beta2.LanguageServiceClient()
    # [END beta_client]

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(content=text.encode('utf-8'),
                              type=enums.Document.Type.PLAIN_TEXT)

    # Pass in encoding type to get useful offsets in the response.
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16

    result = client.analyze_entity_sentiment(document, encoding)

    for entity in result.entities:
        print('Mentions: ')
        print(u'Name: "{}"'.format(entity.name))
        for mention in entity.mentions:
            print(u'  Begin Offset : {}'.format(mention.text.begin_offset))
            print(u'  Content : {}'.format(mention.text.content))
            print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
            print(u'  Sentiment : {}'.format(mention.sentiment.score))
            print(u'  Type : {}'.format(mention.type))
        print(u'Salience: {}'.format(entity.salience))
        print(u'Sentiment: {}\n'.format(entity.sentiment))
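# Note on the encoding selection above: on "narrow" Python 2 builds,
# sys.maxunicode == 65535 and string indices count UTF-16 code units, so
# requesting UTF16 offsets keeps them aligned with Python slicing; on wide
# builds (and Python 3), UTF32 offsets match instead.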
def test_analyze_sentiment(self):
    # Setup Expected Response
    language = "language-1613589672"
    expected_response = {"language": language}
    expected_response = language_service_pb2.AnalyzeSentimentResponse(
        **expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[expected_response])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = language_v1beta2.LanguageServiceClient()

    # Setup Request
    document = {}

    response = client.analyze_sentiment(document)
    assert expected_response == response

    assert len(channel.requests) == 1
    expected_request = language_service_pb2.AnalyzeSentimentRequest(
        document=document)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def test_annotate_text(self):
    # Setup Expected Response
    language = 'language-1613589672'
    expected_response = {'language': language}
    expected_response = language_service_pb2.AnnotateTextResponse(
        **expected_response)

    # Mock the API response
    channel = ChannelStub(responses=[expected_response])
    patch = mock.patch('google.api_core.grpc_helpers.create_channel')
    with patch as create_channel:
        create_channel.return_value = channel
        client = language_v1beta2.LanguageServiceClient()

    # Setup Request
    document = {}
    features = {}

    response = client.annotate_text(document, features)
    assert expected_response == response

    assert len(channel.requests) == 1
    expected_request = language_service_pb2.AnnotateTextRequest(
        document=document, features=features)
    actual_request = channel.requests[0][1]
    assert expected_request == actual_request
def getTextTopic(searchString):
    """Classifies content categories of the provided text."""
    try:
        client = language_v1beta2.LanguageServiceClient()

        document = types.Document(content=searchString,
                                  type=enums.Document.Type.PLAIN_TEXT)
        categories = client.classify_text(document).categories

        if not categories:
            return []

        # Return the top (most confident) category.
        return {
            'name': categories[0].name,
            'confidence': categories[0].confidence
        }
    except ValueError:
        return ''
def entity_sentiment_file(gcs_uri):
    """Detects entity sentiment in a Google Cloud Storage file."""
    client = language_v1beta2.LanguageServiceClient()

    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Pass in encoding type to get useful offsets in the response.
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16

    result = client.analyze_entity_sentiment(document, encoding)

    for entity in result.entities:
        print(u'Name: "{}"'.format(entity.name))
        for mention in entity.mentions:
            print(u'  Begin Offset : {}'.format(mention.text.begin_offset))
            print(u'  Content : {}'.format(mention.text.content))
            print(u'  Magnitude : {}'.format(mention.sentiment.magnitude))
            print(u'  Sentiment : {}'.format(mention.sentiment.score))
            print(u'  Type : {}'.format(mention.type))
        print(u'Salience: {}'.format(entity.salience))
        print(u'Sentiment: {}\n'.format(entity.sentiment))
def getCDID(rows):
    s = []
    language_client = language_v1beta2.LanguageServiceClient()

    def getTag(content_input):
        document = types.Document(content=content_input,
                                  type=enums.Document.Type.PLAIN_TEXT)
        return language_client.classify_text(document)

    if rows is not None:
        name = ("/home/tnguyen/CREU/CREU/HedgeDetection/"
                "parse_data_articles/fulltext/" + rows[0] + '.txt')
        print(name)
        st = ''
        with open(name, 'r') as myfile:
            if os.stat(name).st_size != 0:
                article = myfile.read()
                try:
                    results = getTag(article)
                    if results is not None:
                        for result in results.categories:
                            if result is not None:
                                st = result.name
                                print(st)
                                s.append((rows[0], rows[1], rows[2],
                                          rows[3], rows[4], st))
                except Exception:
                    # classify_text rejects documents with too few tokens.
                    print("too few words")
            else:
                s.append((rows[0], rows[1], rows[2], rows[3], rows[4], 0))
    return s
def entities_text(text):
    """Detects entities in the text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects entities in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    entities = client.analyze_entities(document).entities

    # entity types from enums.Entity.Type
    entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
                   'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

    for entity in entities:
        print('=' * 20)
        print(u'{:<16}: {}'.format('name', entity.name))
        print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
        print(u'{:<16}: {}'.format('metadata', entity.metadata))
        print(u'{:<16}: {}'.format('salience', entity.salience))
        print(u'{:<16}: {}'.format(
            'wikipedia_url', entity.metadata.get('wikipedia_url', '-')))
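# Hypothetical invocation of entities_text() above; any short sentence
# containing named entities works:
#
#   entities_text('President Kennedy spoke at Rice University.')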
def test_classify_text(self, mock_create_stub):
    # Mock gRPC layer
    grpc_stub = mock.Mock()
    mock_create_stub.return_value = grpc_stub

    client = language_v1beta2.LanguageServiceClient()

    # Mock request
    document = {}

    # Mock response
    expected_response = {}
    expected_response = language_service_pb2.ClassifyTextResponse(
        **expected_response)
    grpc_stub.ClassifyText.return_value = expected_response

    response = client.classify_text(document)
    self.assertEqual(expected_response, response)

    grpc_stub.ClassifyText.assert_called_once()
    args, kwargs = grpc_stub.ClassifyText.call_args
    self.assertEqual(len(args), 2)
    self.assertEqual(len(kwargs), 1)
    self.assertIn('metadata', kwargs)
    actual_request = args[0]

    expected_request = language_service_pb2.ClassifyTextRequest(
        document=document)
    self.assertEqual(expected_request, actual_request)
def doEntitiyAnalysis(searchString):
    """Detects entities in the text."""
    try:
        client = language_v1beta2.LanguageServiceClient()

        if isinstance(searchString, six.binary_type):
            text = searchString.decode('utf-8')
        else:
            text = searchString

        # Instantiates a plain text document.
        document = types.Document(content=text,
                                  type=enums.Document.Type.PLAIN_TEXT)

        # Detects entities in the document. You can also analyze HTML with:
        #   document.type == enums.Document.Type.HTML
        entities = client.analyze_entities(document).entities

        # entity types from enums.Entity.Type
        entity_type = ('UNKNOWN', 'PERSON', 'LOCATION', 'ORGANIZATION',
                       'EVENT', 'WORK_OF_ART', 'CONSUMER_GOOD', 'OTHER')

        for entity in entities:
            print('=' * 20)
            print(u'{:<16}: {}'.format('name', entity.name))
            print(u'{:<16}: {}'.format('type', entity_type[entity.type]))
            print(u'{:<16}: {}'.format('metadata', entity.metadata))
            print(u'{:<16}: {}'.format('salience', entity.salience))
            print(u'{:<16}: {}'.format(
                'wikipedia_url', entity.metadata.get('wikipedia_url', '-')))
    except ValueError:
        return ''
def test_analyze_sentiment_exception(self):
    # Mock the API response
    channel = ChannelStub(responses=[CustomException()])
    client = language_v1beta2.LanguageServiceClient(channel=channel)

    # Setup request
    document = {}

    with pytest.raises(CustomException):
        client.analyze_sentiment(document)
def syntax_file(gcs_uri):
    """Detects syntax in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens
    return tokens
def classify_text(text):
    """Classifies content categories of the provided text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = language_v1beta2.types.Document(
        content=text.encode('utf-8'),
        type=language_v1beta2.enums.Document.Type.HTML)

    categories = client.classify_text(document).categories
    return categories
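# A small sketch consuming the categories returned by classify_text()
# above (hypothetical helper; the 0.5 threshold is an arbitrary
# illustration, not an API default):
def top_categories(text, min_confidence=0.5):
    """Returns the names of categories at or above min_confidence."""
    return [category.name for category in classify_text(text)
            if category.confidence >= min_confidence]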
def classify_file(gcs_uri):
    """Classifies the text in a Google Cloud Storage file."""
    client = language_v1beta2.LanguageServiceClient()

    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    categories = client.classify_text(document).categories

    for category in categories:
        print(u'=' * 20)
        print(u'{:<16}: {}'.format('name', category.name))
        print(u'{:<16}: {}'.format('confidence', category.confidence))
def test_analyze_sentiment_exception(self):
    # Mock the API response
    channel = ChannelStub(responses=[CustomException()])
    patch = mock.patch("google.api_core.grpc_helpers.create_channel")
    with patch as create_channel:
        create_channel.return_value = channel
        client = language_v1beta2.LanguageServiceClient()

    # Setup request
    document = {}

    with pytest.raises(CustomException):
        client.analyze_sentiment(document)
def parse_text(text):
    client = language.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(content=text.encode('utf-8'),
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detect and send native Python encoding to receive correct word offsets.
    encoding = enums.EncodingType.UTF32
    if sys.maxunicode == 65535:
        encoding = enums.EncodingType.UTF16

    result = client.analyze_entity_sentiment(document, encoding)

    keywords = []
    categories = []
    for entity in result.entities:
        for mention in entity.mentions:
            if (mention.sentiment.score > 0
                    and entity.name.lower() not in keywords):
                keywords.append(entity.name.lower())

    sections = text.strip().split("SEC.")
    language_client = language_v1beta2.LanguageServiceClient()
    for section in sections:
        subsections = section.strip().split(" (")
        for i in range(0, len(subsections)):
            subsection = subsections[i]
            if len(subsection) > 750:
                document = types2.Document(
                    content=subsection.encode('utf-8'),
                    type=enums2.Document.Type.PLAIN_TEXT)
                result = language_client.classify_text(document)
                for category in result.categories:
                    categories.append(category.name)
            else:
                # Merge short subsections into the next one so the
                # classifier sees enough text.
                if i < len(subsections) - 1:
                    subsections[i + 1] = (subsections[i] + " " +
                                          subsections[i + 1])
    return keywords, categories
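# Hypothetical use of parse_text() above on the text of a bill; the
# variable name is illustrative:
#
#   keywords, categories = parse_text(bill_text)
#   print(sorted(set(categories)))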
def get_topic(article):
    language_client = language_v1beta2.LanguageServiceClient()
    document = types_topic.Document(
        content=f"{article['cleaned_text']}",
        type=enums_topic.Document.Type.PLAIN_TEXT)
    result = language_client.classify_text(document)

    highest_confidence = []
    for category in result.categories:
        highest_confidence.append({
            'category': category.name,
            'confidence': category.confidence
        })

    # Guard against an empty category list, where max() would raise.
    if not highest_confidence:
        return None

    highest = max(highest_confidence, key=lambda x: x['confidence'])
    return filter_topic(highest['category'])
def classify(text):
    language_client = language_v1beta2.LanguageServiceClient()
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)
    result = language_client.classify_text(document)

    newsConfidence = None
    for category in result.categories:
        if "/News" in category.name:
            newsConfidence = category.confidence
            break
    return newsConfidence
def test_analyze_sentiment_exception(self, mock_create_stub):
    # Mock gRPC layer
    grpc_stub = mock.Mock()
    mock_create_stub.return_value = grpc_stub

    client = language_v1beta2.LanguageServiceClient()

    # Mock request
    document = {}

    # Mock exception response
    grpc_stub.AnalyzeSentiment.side_effect = CustomException()

    self.assertRaises(errors.GaxError, client.analyze_sentiment, document)
def sentiment_file(gcs_uri):
    """Detects sentiment in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects sentiment in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    sentiment = client.analyze_sentiment(document).document_sentiment

    print('Score: {}'.format(sentiment.score))
    print('Magnitude: {}'.format(sentiment.magnitude))
def sample_analyze_syntax():
    # Create a client
    client = language_v1beta2.LanguageServiceClient()

    # Initialize request argument(s)
    document = language_v1beta2.Document()
    document.content = "content_value"

    request = language_v1beta2.AnalyzeSyntaxRequest(
        document=document,
    )

    # Make the request
    response = client.analyze_syntax(request=request)

    # Handle the response
    print(response)
def syntax_text(text):
    """Detects syntax in the text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens
    return tokens
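# A sketch consuming the tokens returned by syntax_text() above
# (hypothetical helper, assuming the same `enums` import as the snippets
# above):
def count_nouns(text):
    """Counts tokens tagged as NOUN in the analyzed text."""
    noun_tag = enums.PartOfSpeech.Tag.NOUN
    return sum(1 for token in syntax_text(text)
               if token.part_of_speech.tag == noun_tag)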
def classify_text(text):
    """Classifies content categories of the provided text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    document = types.Document(content=text.encode('utf-8'),
                              type=enums.Document.Type.PLAIN_TEXT)

    categories = client.classify_text(document).categories

    for category in categories:
        print(u'=' * 20)
        print(u'{:<16}: {}'.format('name', category.name))
        print(u'{:<16}: {}'.format('confidence', category.confidence))
def sample_classify_text():
    # Create a client
    client = language_v1beta2.LanguageServiceClient()

    # Initialize request argument(s)
    document = language_v1beta2.Document()
    document.content = "content_value"

    request = language_v1beta2.ClassifyTextRequest(
        document=document,
    )

    # Make the request
    response = client.classify_text(request=request)

    # Handle the response
    print(response)
def sentiment_text(text):
    """Detects sentiment in the text."""
    client = language_v1beta2.LanguageServiceClient()

    if isinstance(text, six.binary_type):
        text = text.decode('utf-8')

    # Instantiates a plain text document.
    document = types.Document(content=text,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects sentiment in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    sentiment = client.analyze_sentiment(document).document_sentiment

    print('Score: {}'.format(sentiment.score))
    print('Magnitude: {}'.format(sentiment.magnitude))
def syntax_file(gcs_uri):
    """Detects syntax in the file located in Google Cloud Storage."""
    client = language_v1beta2.LanguageServiceClient()

    # Instantiates a plain text document.
    document = types.Document(gcs_content_uri=gcs_uri,
                              type=enums.Document.Type.PLAIN_TEXT)

    # Detects syntax in the document. You can also analyze HTML with:
    #   document.type == enums.Document.Type.HTML
    tokens = client.analyze_syntax(document).tokens

    # part-of-speech tags from enums.PartOfSpeech.Tag
    pos_tag = ('UNKNOWN', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM',
               'PRON', 'PRT', 'PUNCT', 'VERB', 'X', 'AFFIX')

    for token in tokens:
        print(u'{}: {}'.format(pos_tag[token.part_of_speech.tag],
                               token.text.content))