def modelsConfig_qa(model):
    ## Question Answering:
    if model == "ELMo-BiDAF (Trained on SQuAD)":
        model_selected = Predictor.from_path(
            "https://storage.googleapis.com/allennlp-public-models/bidaf-elmo-model-2020.03.19.tar.gz"
        )
    elif model == "BiDAF (Trained on SQuAD)":
        model_selected = Predictor.from_path(
            "https://storage.googleapis.com/allennlp-public-models/bidaf-model-2020.03.19.tar.gz"
        )
    elif model == "Transformer QA (Trained on SQuAD)":
        model_selected = Predictor.from_path(
            "https://storage.googleapis.com/allennlp-public-models/transformer-qa-2020-05-26.tar.gz"
        )
    elif model == "distilbert-base-cased-distilled-squad":
        model_selected = qa_pipeline("question-answering", model=model)
    elif model == "bert-large-uncased-whole-word-masking-finetuned-squad":
        model_selected = qa_pipeline("question-answering", model=model)
    # Multilingual:
    elif model == "mrm8488/bert-multi-cased-finetuned-xquadv1 [multilingual]":
        model = "mrm8488/bert-multi-cased-finetuned-xquadv1"
        model_selected = qa_pipeline("question-answering", model=model)
    else:
        raise Exception("Not a valid model")
    return model_selected
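# Hypothetical usage sketch (passage/question strings are illustrative, not from the
# original source). The returned object is an AllenNLP Predictor for the AllenNLP
# options and a Hugging Face question-answering pipeline for the transformer options,
# so the two are called differently.
qa = modelsConfig_qa("BiDAF (Trained on SQuAD)")
print(qa.predict(passage="AllenNLP is a library built on PyTorch.",
                 question="What is AllenNLP built on?")["best_span_str"])

qa = modelsConfig_qa("distilbert-base-cased-distilled-squad")
print(qa(context="AllenNLP is a library built on PyTorch.",
         question="What is AllenNLP built on?")["answer"])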
def __init__(self, ner=None, coref=None, dep=None):
    self.ner = ner if ner else Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz")
    self.coref = coref if coref else Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2018.02.05.tar.gz")
    self.dep = dep if dep else Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/biaffine-dependency-parser-ptb-2018.08.23.tar.gz")
def load(self, name):
    if name == 'machine_comprehension' and not self.is_loaded(name):
        self.models[name].model = Predictor.from_path(
            os.path.join(self.models_path, "bidaf-model-2017.09.15-charpad.tar.gz"))
    if name == 'named_entity_recognition' and not self.is_loaded(name):
        self.models[name].model = Predictor.from_path(
            os.path.join(self.models_path, "ner-model-2018.12.18.tar.gz"))
    if name == 'textual_entailment' and not self.is_loaded(name):
        self.models[name].model = Predictor.from_path(
            os.path.join(self.models_path, "decomposable-attention-elmo-2018.02.19.tar.gz"))
    if name == 'coreference_resolution' and not self.is_loaded(name):
        self.models[name].model = Predictor.from_path(
            os.path.join(self.models_path, "coref-model-2018.02.05.tar.gz"))
    if name == 'semantic_role_labeling' and not self.is_loaded(name):
        self.models[name].model = Predictor.from_path(
            os.path.join(self.models_path, "srl-model-2018.05.25.tar.gz"))
    if name == 'constituency_parsing' and not self.is_loaded(name):
        self.models[name].model = Predictor.from_path(
            os.path.join(self.models_path, "elmo-constituency-parser-2018.03.14.tar.gz"))
    if name == 'dependency_parsing' and not self.is_loaded(name):
        self.models[name].model = Predictor.from_path(
            os.path.join(self.models_path,
                         "biaffine-dependency-parser-ptb-2018.08.23.tar.gz"))
    if name == 'open_information_extraction' and not self.is_loaded(name):
        self.models[name].model = Predictor.from_path(
            os.path.join(self.models_path, "openie-model.2018-08-20.tar.gz"))
    if name == 'event2mind' and not self.is_loaded(name):
        self.models[name].model = Predictor.from_path(
            os.path.join(self.models_path, "event2mind-2018.09.17.tar.gz"))
def __init__(
    self,
    model_id: str,
):
    try:
        self.predictor = Predictor.from_path("hf://" + model_id)
    except (IOError, OSError):
        nltk = os.getenv("NLTK_DATA")
        if nltk is None:
            raise
        directory = os.path.join(nltk, "corpora")
        shutil.rmtree(directory)
        self.predictor = Predictor.from_path("hf://" + model_id)
def run(
    self, paragraphs: Dict[str, str], coref_model_path: str, batch_size=8
) -> Dict[str, str]:
    """Run coreference resolution.

    Args:
        paragraphs (Dict[str, str]): id: {paragraph}

    Returns:
        Dict[str, str]: output id: paragraph with coreference resolution applied
    """
    if torch.cuda.is_available():
        logger.info("GPU found")
        logger.info("Initializing Coreference predictor with GPU")
        predictor = Predictor.from_path(coref_model_path, cuda_device=0)
    else:
        logger.info("Initializing Coreference predictor with CPU")
        predictor = Predictor.from_path(coref_model_path)
    logger.info(f"Batch_size = {batch_size}")
    batches = create_dict_chunks(paragraphs, batch_size)
    resolved_values = {}
    for batch in tqdm(
        batches,
        desc="Running coreference resolution",
        total=math.ceil(len(paragraphs) / batch_size),
    ):
        resolved_values = {
            **resolved_values,
            **{
                id: val
                for id, val in zip(
                    batch.keys(),
                    predictor.predict_batch_json(
                        [{"document": paragraph} for paragraph in batch.values()]
                    ),
                )
            },
        }
    logger.success("Coreference resolution successful")
    logger.info("Resolving Coreference")
    results = {}
    for key, res in resolved_values.items():
        new_paragraph = self.coref_sub(res["document"], res["clusters"])
        results[key] = " ".join(new_paragraph)
    return results
def main():
    if not (args.input and args.output):
        raise RuntimeError(
            'You have to define both input and output file paths')
    with open(args.input) as json_file:
        coref_model = Predictor.from_path(
            "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
        )
        entity_resol_model = Predictor.from_path(
            "https://storage.googleapis.com/allennlp-public-models/ner-model-2020.02.10.tar.gz"
        )
        pos_tagging_model = spacy.load("en_core_web_lg")
        window = 4  # Coreference resolution on up to 5 previous queries
        data = json.load(json_file)
        out = ""
        # print(data)
        for p in data:
            # print(p)
            # desc = p['description']
            current_conv = p['number']
            # print("Description: " + desc + '\n')
            print("Current conv: " + str(current_conv) + '\n')
            # out = out + '\n' + 'Number: ' + str(current_conv) + '\n' + 'Description: ' + desc + '\n'
            u = [p['turn'][0]['raw_utterance']]
            out = out + str(current_conv) + '_1' + '\t' + u[0] + '\n'
            for t in p['turn'][1:]:
                current_turn = t['number']
                if len(u) < window:
                    u.append(t['raw_utterance'])
                else:
                    u.pop(0)
                    u.append(t['raw_utterance'])
                # print(str(current_conv) + '_' + str(current_turn) + ": " + current_utterance)
                print("Input: " + " ".join(u))
                # indexes = indexer(u)
                resolved, indexes = coref_resol(coref_model, entity_resol_model,
                                                pos_tagging_model, u)
                u = update_utterances(resolved, indexes)
                print("Output: " + str(u))
                print("---")
                out = out + str(current_conv) + '_' + str(current_turn) + '\t' + u[-1] + '\n'
            print('\n --- --- --- \n')
    print(out)
    output = open(args.output, "w")
    output.write(out)
    output.close()
def dependency_parsing(resolved_corefs, updated_bios):
    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz",
        cuda_device=0
    )
    dps = []
    logging.info("Performing dependency parsing...")
    for paragraph_index, evidences in tqdm(enumerate(resolved_corefs)):
        paragraph_bio = updated_bios[paragraph_index]
        dp = []
        bio_index = 0
        for ev_index, evidence in enumerate(evidences):
            pred = predictor.predict(sentence=evidence)
            nodes = []
            for i in range(len(pred['words'])):
                try:
                    bio = paragraph_bio[bio_index + i]
                except IndexError:
                    logging.warning("Mismatch between bio data length and paragraph length")
                    bio = 0
                nodes.append({
                    'head': pred['predicted_heads'][i] - 1,
                    'pos': pred['pos'][i],
                    'dep': pred['predicted_dependencies'][i],
                    'word': pred['words'][i],
                    'ans': bio
                })
            bio_index += len(pred['words'])
            dp.append(nodes)
        dps.append(dp)
    return dps
def __init__(self):
    self.predictor = AllenNLPPredictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/bidaf-elmo-model-2020.03.19.tar.gz",
        cuda_device=torch.cuda.current_device()
    )
    # print('file location={}'.format((os.path.dirname(os.path.abspath(__file__)))))
    filepath = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'data',
                            '1000453_1997-11-14_CREDIT AGREEMENT.txt')
    # print('cwd={}'.format(filepath))
    with open(filepath, 'r') as credit_doc:
        content = credit_doc.read()
    lines = content.split('\n')
    line_array = []
    max_lines = 1000
    i = 0
    for line in lines:
        if len(line) > 0:
            line_array.append(line)
            i += 1
            # if i > max_lines:
            #     break
    self.corpus = '\n'.join(line_array)
def __init__(self, model_path=None, cuda_device=1):
    # model_path = model_path or LSTM_MODEL_PATH
    model_path = model_path or ROBERTA_MODEL_PATH
    self.predictor = Predictor.from_path(model_path, cuda_device=cuda_device)
    _tokenizer = PretrainedTransformerTokenizer(
        model_name="roberta-base", max_length=TRANSFORMER_WORDPIECE_LIMIT)
    class_name_mapper = {"0": "Negative", "1": "Positive"}
    _model = self.predictor._model
    _label_namespace = _model._label_namespace
    class_names = [
        class_name_mapper[_model.vocab.get_index_to_token_vocabulary(
            _label_namespace).get(0)],
        class_name_mapper[_model.vocab.get_index_to_token_vocabulary(
            _label_namespace).get(1)]
    ]
    # reset the tokenizer to remove separators
    self.tokenizer = lambda s: [
        t.text.replace("Ġ", "").replace('Ċ', '').replace('ĉ', "")
        for t in _tokenizer.tokenize(s)
    ][1:-1]
    self.explainer_lime = LimeTextExplainer(
        class_names=class_names, split_expression=self.tokenizer)
    self.explainer_integrate = IntegratedGradient(self.predictor)
    self.explainer_simple = SimpleGradient(self.predictor)
def perform_srl(responses, prompt=None):
    """
    Perform semantic role labeling on a list of responses, given a prompt.

    Args:
        responses: a list of responses (or full sentences if prompt = None)
        prompt: the prompt that should be added as a prefix to the responses

    Returns:
        the output of the AllenNLP SRL model
    """
    predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz"
    )
    if prompt:
        sentences = [{
            "sentence": prompt + " " + response
        } for response in responses]
    else:
        sentences = [{"sentence": response} for response in responses]
    output = predictor.predict_batch_json(sentences)
    full_output = [{
        "sentence": prompt + " " + response if prompt else response,
        "response": response,
        "srl": srl
    } for (response, srl) in zip(responses, output)]
    return full_output
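# Hypothetical usage sketch (the example strings are illustrative, not from the
# original source): each "srl" entry is a standard AllenNLP SRL result with a "verbs"
# list, where every verb carries a "description" and BIO-style "tags" aligned to "words".
results = perform_srl(["ate an apple.", "went home."], prompt="The boy")
for item in results:
    print(item["sentence"])
    for verb in item["srl"]["verbs"]:
        print("  ", verb["description"])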
def pos_analyze(sentence_data):
    # pdb.set_trace()
    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz")
    blank_documents = {}
    for i, document in tqdm(enumerate(sentence_data)):
        blank_sentence = {}
        for sentence in document:
            sentence = simplify(sentence, predictor)
            blank_sentence[sentence] = []
            sentence_labels = predictor.predict(sentence=sentence)
            hierplane_tree = sentence_labels['hierplane_tree']
            ADV_result = []
            find_labels(hierplane_tree['root'], "RB", "ADVP", ADV_result)
            # blank_sentence[sentence]["ADV"] = []
            for item in ADV_result:
                new_sentence = sentence.replace(item, "[BLANK]")
                blank_sentence[sentence].append(tuple([new_sentence, item, "ADV"]))
            PP_result = []
            find_labels(hierplane_tree['root'], "IGNORE", "PP", PP_result)
            # blank_sentence[sentence]["PP"] = []
            for item in PP_result:
                new_sentence = sentence.replace(item, "[BLANK]")
                blank_sentence[sentence].append(tuple([new_sentence, item, "PP"]))
            # print(blank_sentence[sentence])
        blank_documents[i] = blank_sentence
    with open("sentences_with_blank.json", 'w', encoding="utf8") as f:
        json.dump(blank_documents, f, indent=2, ensure_ascii=False)
    return blank_documents
def __init__(self,
             name: str,
             model_path: str = None,
             model_online_path: str = None,
             description: str = '',
             model_type: str = None) -> None:
    """A class specifically created for wrapping the predictors from Allennlp:
    https://allenai.github.io/allennlp-docs/api/allennlp.predictors.html

    Parameters
    ----------
    name : str
        The name of the predictor.
    model_path : str, optional
        A local model path if you are using local models, by default None.
        This and ``model_online_path`` cannot both be None.
    model_online_path : str, optional
        An online model path, by default None
    description : str, optional
        A sentence describing the predictor, by default ''
    model_type : str, optional
        The model type as used in Allennlp, by default None

    Returns
    -------
    None
    """
    model = None
    if model_path:
        archive = load_archive(model_path)
        model = AllenPredictor.from_archive(archive, model_type)
    elif model_online_path:
        model = AllenPredictor.from_path(model_online_path, model_type)
    self.predictor = model
    Predictor.__init__(self, name, description, model, ['accuracy'])
def run_srl(ecb_path: str, data_loader: IDataLoader):
    documents = data_loader.read_data_from_corpus_folder(ecb_path)
    predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/srl-model-2018.05.25.tar.gz"
    )

    sentences = Doc.to_sentences(documents)
    all_sentence_verbs = list()
    for sentence in sentences:
        srl_sentence = SRLSentence(sentence.doc_id, sentence.sent_id)
        sentence_word = sentence.get_sentence_words()
        prediction = predictor.predict_tokenized(
            tokenized_sentence=sentence_word)
        verbs = prediction['verbs']
        words = prediction['words']
        for verb in verbs:
            srl_verb = SRLVerb()
            tags = verb['tags']
            srl_verb.add_var(tags, words)
            srl_sentence.add_srl_vrb(srl_verb)

        all_sentence_verbs.append(srl_sentence)
        print('Done with sentence from doc-' + sentence.doc_id + ', withId-' +
              str(sentence.sent_id))

    return all_sentence_verbs
def __init__(self, configuration):
    self.max_nodes = configuration["task"]["max_nodes"]
    self.max_query_size = configuration["task"]["max_query_size"]
    self.max_candidates = configuration["task"]["max_candidates"]

    dataset_location = configuration["task"]["dataset_folder"]

    in_file = dataset_location + "wikihop/train.json"
    with open(in_file, 'r') as f:
        self.raw_train_data = json.load(f)

    in_file = dataset_location + "wikihop/dev.json"
    with open(in_file, 'r') as f:
        self.raw_dev_data = json.load(f)

    in_file = dataset_location + "wikihop/dev.json"
    with open(in_file, 'r') as f:
        self.raw_test_data = json.load(f)

    self.nlp = English()
    self.glove_embedder = GloveEmbedder(configuration["preprocessing"]["glove_embeddings"]["file"])

    # self.predictor = Predictor.from_path(
    #     "https://allennlp.s3.amazonaws.com/models/coref-model-2020.02.10.tar.gz")
    # self.predictor = pretrained.load_predictor("coref")
    self.predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2018.02.05.tar.gz")
    # self.predictor = pretrained.load_predictor("coref-spanbert")
    # self.predictor = Predictor.from_path(
    #     "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-base-2020.02.27.tar.gz")

    self.evaluator = ChoiceEvaluator()

    AbstractTestProblem.__init__(self, configuration)
def _predictor_server(path, qi, qo):
    # Using os.fork() (called from multiprocessing) and allowing cleanups to be run like
    # normal is dangerous because some filesystem-related cleanups might be called
    # twice. That's why we remove them first, without executing them in this process.
    atexit._clear()

    from allennlp.predictors.predictor import Predictor
    from allennlp.predictors.semantic_role_labeler import SemanticRoleLabelerPredictor
    from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter

    # Use en_core_web_md for tokenizer instead
    # TODO: make optional
    SemanticRoleLabelerPredictor.__init__ = Predictor.__init__
    predictor = Predictor.from_path(path)
    predictor._tokenizer = SpacyWordSplitter(language="en_core_web_md", pos_tags=True)

    while True:
        s = qi.get()
        if s is None:
            break
        qo.put_nowait(predictor.predict(s))

    # We need to manually call atexit callbacks here because the multiprocessing module
    # doesn't call them:
    # https://stackoverflow.com/a/34507557/
    # https://github.com/python/cpython/blob/49fd6dd887df6ea18dbb1a3c0f599239ccd1cb42/Lib/multiprocessing/popen_fork.py#L75
    # But if we don't call them, allennlp leaves extracted archives in the $TMPDIR:
    # https://github.com/allenai/allennlp/blob/fefc439035df87e3d2484eb2f53ca921c4c2e2fe/allennlp/models/archival.py#L176-L178
    logger.debug("atexit should call %d callbacks", atexit._ncallbacks())
    atexit._run_exitfuncs()
def init():
    actionList = [
        "activate", "activates", "activated", "inhibit", "inhibits",
        "inhibited", "bind", "binds", "require", "requires", "required",
        "prevent", "prevents", "prevented"
    ]
    constPredictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/elmo-constituency-parser-2020.02.10.tar.gz"
    )
    text = getInput('CS372_HW4_input_20170441.csv')
    outputFile = open('CS372_HW4_output_20170441.csv',
                      'w',
                      newline='',
                      encoding='utf-8-sig')
    wr = csv.writer(outputFile)
    wr.writerow(
        ["Marked", "Sentence", "Actual Triples", "System Triples", "Cited"])
    for line in text:
        cited = line[0]
        sentence = line[1]
        triples = line[2].split(", ")
        marked = line[3]
        systemTriples = findSVO(sentence, actionList, constPredictor)
        if marked == "Training":
            wr.writerow(
                ["Training Sentence", sentence, triples, systemTriples, cited])
        else:
            wr.writerow(
                ["Test Sentence", sentence, triples, systemTriples, cited])
    outputFile.close()
def __init__(self, data_folder='data/src_data/rest/'):
    self.negative_words_list = [
        'doesn\'t', 'don\'t', 'didn\'t', 'no', 'did not', 'do not',
        'does not', 'not yet', 'not', 'none', 'no one', 'nobody', 'nothing',
        'neither', 'nowhere', 'never', 'hardly', 'scarcely', 'barely'
    ]
    self.negative_words_list = sorted(self.negative_words_list,
                                      key=lambda s: len(s),
                                      reverse=True)
    self.degree_word_list = [
        'absolutely', 'awfully', 'badly', 'barely', 'completely',
        'decidedly', 'deeply', 'enormously', 'entirely', 'extremely',
        'fairly', 'fully', 'greatly', 'highly', 'incredibly', 'indeed',
        'very', 'really'
    ]
    text = self.read_text([
        os.path.join(data_folder, 'train_sent.json'),
        os.path.join(data_folder, 'dev_sent.json'),
        os.path.join(data_folder, 'test_sent.json'),
    ])
    self.word2idx = self.get_word2id(text)
    self.predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz"
    )
    self.nlp = stanza.Pipeline(lang='en',
                               processors='tokenize,mwt,pos',
                               tokenize_pretokenized=True)
def entityRetrieval(query):
    predicts = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/ner-model-2018.12.18.tar.gz"
    )
    results = predicts.predict_json({"sentence": query})
    entity = []
    wordList = []
    flag = False
    for index, tag in enumerate(results['tags']):
        if len(str(tag)) > 1:
            wordList.append(process().lemmatize(
                results['words'][index].lower()))
            if str(tag).startswith('B-'):
                phrase = process().lemmatize(results['words'][index].lower())
                flag = True
            elif flag:
                phrase += " " + process().lemmatize(
                    results['words'][index].lower())
                if str(tag).startswith('L-'):
                    flag = False
                    entity.append(phrase)
            else:
                entity.append(process().lemmatize(
                    results['words'][index].lower()))
    return entity, wordList
def __init__(self):
    if SRLParser.__instance is not None:
        raise Exception("This class is a Singleton!")
    else:
        self.logger = LoggerFactory(self).getLogger()
        self.predictor = Predictor.from_path("libs/allennlp-SRL.tar.gz")
        SRLParser.__instance = self
def get_predictors():
    print("current device: {}".format(torch.cuda.current_device()))
    srl_predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/bert-base-srl-2020.03.24.tar.gz",
        cuda_device=torch.cuda.current_device())
    return srl_predictor
def extract_dependency_parse(
        data_file,
        out_file,
        dep_parser_path='/Users/liming/nltk_data/stanford-parser-full-2018-10-17/edu/stanford/nlp/models/parser/nndep/english_UD.gz'
):
    # Load the dependency parser
    dep_parser = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/biaffine-dependency-parser-ptb-2020.04.06.tar.gz"
    )
    # stanford.StanfordDependencyParser(path_to_models_jar=dep_parser_path)

    with open(data_file, 'r') as f_in,\
            open('{}.json'.format(out_file), 'w') as f_out:

        for ex, line in enumerate(f_in):
            # if ex > 30:  # XXX
            #     break
            print('\rExample {}'.format(ex), end='')
            data_dict = json.loads(line)
            sent = data_dict['tokens']

            # Parse the sentence
            sent_len = len(sent)
            parsed_sent = dep_parser.predict(' '.join(sent))
            predicted_labels = parsed_sent['predicted_dependencies']
            predicted_heads = parsed_sent['predicted_heads']
            dep_parse_dict = {
                'predicted_dependencies': predicted_labels,
                'predicted_heads': predicted_heads
            }

            # Save the parse info into the data dict
            data_dict['dep_parse'] = dep_parse_dict
            f_out.write('{}\n'.format(json.dumps(data_dict)))
def Bidaf(context, question):
    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/bidaf-model-2020.03.19.tar.gz"
    )
    answer = predictor.predict(passage=context, question=question)["best_span_str"]
    return answer
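# Hypothetical usage sketch (the passage and question are illustrative, not from the
# original source). Note that Bidaf downloads and loads the model archive on every
# call; for repeated queries it is cheaper to hoist Predictor.from_path out of the
# function and reuse the predictor.
context = "AllenNLP is an open-source NLP research library built on PyTorch."
print(Bidaf(context, "What is AllenNLP built on?"))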
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('ref_caption_file')
    parser.add_argument('out_file')
    parser.add_argument('--cuda_device', default=-1, type=int)
    opts = parser.parse_args()

    predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/bert-base-srl-2019.06.17.tar.gz",
        cuda_device=opts.cuda_device)

    ref_caps = json.load(open(opts.ref_caption_file))
    uniq_sents = set()
    for key, sents in ref_caps.items():
        for sent in sents:
            uniq_sents.add(sent)
    uniq_sents = list(uniq_sents)
    print('unique sents', len(uniq_sents))

    outs = {}
    if os.path.exists(opts.out_file):
        outs = json.load(open(opts.out_file))
    for i, sent in enumerate(uniq_sents):
        if sent in outs:
            continue
        try:
            out = predictor.predict_tokenized(sent.split())
        except KeyboardInterrupt:
            break
        except:
            continue
        outs[sent] = out
        if i % 1000 == 0:
            print('finish %d / %d = %.2f%%' %
                  (i, len(uniq_sents), i / len(uniq_sents) * 100))
    with open(opts.out_file, 'w') as f:
        json.dump(outs, f)
def transformer_qna(context, question):
    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/transformer-qa-2020-05-26.tar.gz"
    )
    answer = predictor.predict(passage=context, question=question)["best_span_str"]
    return answer
def QA(text, question):
    path_to_model = "/data/bidaf-elmo-model-2020.03.19"
    text = text.replace('\n', '')
    predictor = Predictor.from_path(path_to_model)
    result = predictor.predict(passage=text, question=question)
    return result['best_span_str']
def NAQANet(context, question):
    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/naqanet-2020.02.19.tar.gz"
    )
    answer = predictor.predict(passage=context, question=question)["answer"]["value"]
    return answer
def main():
    args = parse_args()
    predictor = Predictor.from_path(args.model_path)
    data = [('Restaurants_Train_v2.xml', 'Restaurants_Test_Gold.xml'),
            ('Laptop_Train_v2.xml', 'Laptops_Test_Gold.xml')]
    for train_file, test_file in data:
        # xml -> txt
        xml2txt(os.path.join(args.data_path, train_file))
        xml2txt(os.path.join(args.data_path, test_file))
        # txt -> json
        train_sentences = get_dependencies(
            os.path.join(args.data_path, train_file.replace('.xml', '_text.txt')),
            predictor)
        test_sentences = get_dependencies(
            os.path.join(args.data_path, test_file.replace('.xml', '_text.txt')),
            predictor)
        print(len(train_sentences), len(test_sentences))
        syntaxInfo2json(train_sentences, os.path.join(args.data_path, train_file))
        syntaxInfo2json(test_sentences, os.path.join(args.data_path, test_file))
def loadCorefPredictor():
    print("loading predictor...")
    predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/coref-model-2018.02.05.tar.gz"
    )
    print("successfully loaded the predictor")
    return predictor
def main():
    filename = "../data/result.tsv"
    predictor = Predictor.from_path(
        "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo-constituency-parser-2018.03.14.tar.gz"
    )
    write(filename, predictor)
def extract_org_allenNLP(join_sentences, def_tag):
    predictor = Predictor.from_path(
        "https://storage.googleapis.com/allennlp-public-models/ner-model-2020.02.10.tar.gz"
    )
    pre = predictor.predict(sentence=join_sentences)
    _persons_ = [''] * len(pre['tags'])
    pre_tag = pre['tags']
    pre_words = pre['words']
    for i in range(len(pre_tag)):
        tag = pre_tag[i]
        if def_tag in tag:
            _persons_[i] = pre_words[i]
    _persons_ = [list(g) for k, g in groupby(_persons_, key=bool) if k]
    names = []
    for i in range(len(_persons_)):
        names.append(' '.join(_persons_[i]))

    # Remove duplicate names, keeping lone names as well
    temp = []
    for i in range(len(names)):
        if names[i] not in temp:
            temp.append(names[i])
    names = temp

    # Remove duplicate names irrespective of the case
    wordset = set(names)
    names = [
        item for item in wordset
        if item.istitle() or item.title() not in wordset
    ]
    return names
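# Hypothetical usage sketch (the sentence and tag are illustrative, not from the
# original source): def_tag is matched as a substring of the predicted BIO tags, so
# "ORG" collects B-ORG/I-ORG/L-ORG/U-ORG spans, while "PER" would collect person names.
orgs = extract_org_allenNLP("Apple and Microsoft Corporation announced a partnership.", "ORG")
print(orgs)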