def paper_title_NLP(title_corpus):
    """Analyze paper-title keyword frequencies per year and plot the trend of 'deep'.

    Parameters
    ----------
    title_corpus : list of tuple
        Each tuple ``t`` provides the raw title text at ``t[1]`` and a
        (year, month) index at ``(t[3], t[4])`` — e.g. ``(19, 1)`` means
        2019/01 (two-digit year offset from 2000, per the '20{}' print below).

    Side effects: prints per-year FreqDist summaries and shows a matplotlib
    plot of the combined relative frequency of the tokens 'Deep' and 'deep'.

    Tokenizer reference:
    https://stackoverflow.com/questions/36353125/nltk-regular-expression-tokenizer
    """
    pattern = r'''(?x)          # set flag to allow verbose regexps
        (?:[A-Z]\.)+            # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*          # words with optional internal hyphens
        | \$?\d+(?:\.\d+)?%?    # currency and percentages, e.g. $12.40, 82%
        | \.\.\.                # ellipsis
        | [][.,;"'?():_`-]      # these are separate tokens; includes ], [
    '''
    tokenizer = RegexpTokenizer(pattern)

    # Group tokenized titles by (year, month). setdefault replaces the
    # duplicated if/else-append pattern.
    title_dict = {}
    for t in title_corpus:
        key = (t[3], t[4])
        title_dict.setdefault(key, []).append(tokenizer.tokenize(t[1]))

    # Regroup the monthly buckets by year index alone.
    title_years = {}
    for (year, _month), token_lists in title_dict.items():
        title_years.setdefault(year, []).append(token_lists)

    # Iterate years in ascending order so deep_freq lines up with the
    # chronological x-axis of the plot (dict insertion order follows the
    # corpus order and is NOT guaranteed to be chronological — this was
    # the original bug).
    deep_freq = []
    years = []
    for year in sorted(title_years):
        fd = FreqDist()
        # Flatten: list of monthly buckets -> list of tokenized titles.
        for tokenized_title in (title for bucket in title_years[year] for title in bucket):
            for word in tokenized_title:
                fd[word] += 1
        print('The keywords for year:20{}'.format(str(year)))
        print("Total number of words:{}".format(str(
            fd.N())))  # total number of samples
        print("Total number of unique words:{}".format(str(
            fd.B())))  # number of bins or unique samples
        fd.pprint(50)  # The maximum number of items to display, default is 10
        # fd.freq() is relative frequency; combine both capitalizations.
        deep_freq.append(fd.freq('Deep') + fd.freq('deep'))
        years.append(2000 + int(year))  # two-digit year index -> full year

    print(deep_freq)
    # x-axis derived from the data instead of a hardcoded 2012-2018 list,
    # so the plot stays correct for any year span.
    plt.plot(years, deep_freq)
    plt.ylabel('frequency of deep word')
    plt.xlabel('years')
    plt.show()