                      type=int)
required.add_argument('--output', help="Output directory", required=True)
args = parser.parse_args()

input_dir = args.input      # renamed to avoid shadowing the built-in `input`
vec_size = args.size
output_dir = args.output

if not input_dir.endswith('/'):
    input_dir = input_dir + '/'
if not output_dir.endswith('/'):
    output_dir = output_dir + '/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

sentences = get_sentences(input_dir)
tagged_data = [
    TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
    for i, _d in enumerate(sentences)
]

max_epochs = 100
alpha = 0.025

# `vector_size` replaces the `size` kwarg removed in gensim 4.0
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1)
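# The fragment stops right after the model is built. A minimal sketch of how
# training likely continues, assuming gensim's Doc2Vec API and the
# tutorial-style per-epoch alpha decay implied by `max_epochs`, `alpha`, and
# `min_alpha` above; the output file name `d2v.model` is illustrative, not
# from the source.
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    # Decay the learning rate manually and pin min_alpha so gensim
    # does not decay it again internally on the next call.
    model.alpha -= 0.0002
    model.min_alpha = model.alpha

model.save(output_dir + 'd2v.model')  # hypothetical file name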
sentence = args.sentence
alpha = args.alpha
eta = args.eta
bigram = args.bigram
max_df = args.max_df
min_df = args.min_df

if not input_dir.endswith('/'):
    input_dir = input_dir + '/'
if not output_dir.endswith('/'):
    output_dir = output_dir + '/'

if sentence:
    logging.info('Reading sentences of messages...')
    # Get sentences of messages
    messages = get_sentences(input_dir)
else:
    logging.info('Reading messages...')
    # Get the whole messages
    messages = get_messages(input_dir)

logging.info('Running LDA with {} clusters...'.format(n_clusters))

# Initialise the count vectorizer
if bigram:
    count_vectorizer = CountVectorizer(analyzer='word',
                                       stop_words=STOP_WORDS,
                                       ngram_range=(1, 2),
                                       max_df=max_df,
                                       min_df=min_df)
else:
    # Unigram counterpart of the bigram branch above
    count_vectorizer = CountVectorizer(analyzer='word',
                                       stop_words=STOP_WORDS,
                                       max_df=max_df,
                                       min_df=min_df)
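# A plausible continuation (not from the source): fitting scikit-learn's
# LatentDirichletAllocation on the vectorized messages, wiring in the
# `alpha`/`eta` priors and `n_clusters` parsed above.
from sklearn.decomposition import LatentDirichletAllocation

count_data = count_vectorizer.fit_transform(messages)
lda = LatentDirichletAllocation(n_components=n_clusters,
                                doc_topic_prior=alpha,   # per-document topic prior
                                topic_word_prior=eta,    # per-topic word prior
                                random_state=0)
lda.fit(count_data)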
def index(word):
    return jsonify(get_sentences(word))
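# A hedged sketch of how this handler is typically wired up in Flask; the
# route path, app setup, and run block below are assumptions, not from the
# source (`jsonify` implies Flask, and `word` reads as a path parameter).
from flask import Flask, jsonify

app = Flask(__name__)

@app.route('/<word>')
def index(word):
    return jsonify(get_sentences(word))

if __name__ == '__main__':
    app.run()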
min_cl = args.min_cl
max_cl = args.max_cl
samples = args.samples
keywords = args.keywords
sentence = args.sentence
vector_path = args.vector_path

if not input.endswith('/'):
    input = input + '/'
if not output.endswith('/'):
    output = output + '/'

if sentence:
    logging.info('Reading sentences of emails...')
    # Get sentences of emails
    emails = get_sentences(input)
else:
    logging.info('Reading emails...')
    # Get emails
    emails = get_emails(input)

# Max number of clusters defaults to n_samples // 2 if not specified
if max_cl is None:
    max_cl = len(emails) // 2

# Min number of clusters must be greater than 1
if min_cl < 2:
    sys.exit('Minimum number of clusters should be greater than 1.')
if max_cl > len(emails) - 1:
    sys.exit('Maximum number of clusters should be less than n_samples.')
if min_cl > max_cl:
    sys.exit('Minimum number of clusters should not exceed the maximum.')
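# A hedged sketch (not from the source) of one common way the validated
# [min_cl, max_cl] range gets used: sweep k with KMeans and keep the
# silhouette-best cluster count. `vectors` is a placeholder for whatever
# embeddings the script loads from `vector_path`; it is not a name from
# the original code.
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

best_k, best_score = None, -1.0
for k in range(min_cl, max_cl + 1):
    labels = KMeans(n_clusters=k, random_state=0).fit_predict(vectors)
    score = silhouette_score(vectors, labels)
    if score > best_score:
        best_k, best_score = k, score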
def get_feature_vector(feature_data):
    headline, content = feature_data
    print(headline)

    flat_content = ' '.join(content)
    content_sentences = [get_sentences(paragraph) for paragraph in content]

    qm_count_headline, em_count_headline, digit_count = counts(headline, 0)
    qm_count_content, em_count_content = counts(flat_content, 1)

    headline = headline.split()
    content = [paragraph.split() for paragraph in content]

    num_contractions_headline = num_contractions(headline)
    num_contractions_content = sum(num_contractions(paragraph) for paragraph in content)
    num_stopwords = sum(count_stopwords(paragraph) for paragraph in content)

    headline = [removePunctuation(word) for word in headline]
    content = [[removePunctuation(word) for word in paragraph] for paragraph in content]

    # Drop tokens that became empty after punctuation removal
    content_words = [word.split()[0]
                     for paragraph in content
                     for word in paragraph
                     if word.split() != []]

    num_words = len(content_words)
    num_tokens = token_count(' '.join(content_words))
    avg_length_words, longest_word = word_lengths(content_words, num_words)

    # Average sentence length in words; content_sentences holds one
    # sentence list per paragraph, so count the sentences themselves
    num_sentences = sum(len(sents) for sents in content_sentences)
    avg_length_sentences = num_words / num_sentences

    starts_with_question_word = int(starts_with_q_word(headline))
    stopword_ratio = num_stopwords / num_words
    contraction_ratio = num_contractions_content / num_words

    headline = ' '.join(headline)
    POS_tags_headline = getPOSTags(headline)
    POS_tags_content = getPOSTags(' '.join(content_words))
    adverb = int(contains_adverb(POS_tags_headline))
    super_adj_count, super_adv_count = superlative_adj_adv_count(POS_tags_content)
    POS_counts = posTagFeatures(POS_tags_content)

    BERT_keywords = ' '.join(get_key_words(content_words))
    document_sim = get_similarity(BERT_keywords, headline)

    # Keyword similarity per sentence and per paragraph, iterating over the
    # sentence strings rather than the cleaned word lists
    sentence_sims = []
    paragraph_sims = []
    for paragraph_sents in content_sentences:
        for sent in paragraph_sents:
            sentence_sims.append(get_similarity(BERT_keywords, sent))
        paragraph_sims.append(get_similarity(BERT_keywords, ' '.join(paragraph_sents)))
    sentence_sims = sum(sentence_sims) / len(sentence_sims)
    paragraph_sims = sum(paragraph_sims) / len(paragraph_sims)

    vector = [qm_count_headline, em_count_headline, digit_count,
              qm_count_content, em_count_content,
              num_contractions_headline, num_contractions_content,
              num_stopwords, num_words, num_tokens,
              avg_length_words, longest_word, avg_length_sentences,
              starts_with_question_word, stopword_ratio, contraction_ratio,
              adverb, super_adj_count, super_adv_count,
              document_sim, sentence_sims, paragraph_sims]
    for item in POS_counts:
        vector.append(int(item))

    print(headline, ' DONE')
    return (feature_data[0], vector)
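# A hedged usage sketch (not from the source): the per-headline progress
# prints above suggest batch extraction, so one natural way to drive this
# function is mapping it over (headline, paragraphs) pairs in parallel.
# `articles` and its contents are placeholders, not from the original code.
from multiprocessing import Pool

articles = [
    ("You Won't Believe What Happened Next",
     ["First paragraph of body text.", "Second paragraph of body text."]),
]

if __name__ == '__main__':
    with Pool() as pool:
        features = pool.map(get_feature_vector, articles)
    # Each result is (original_headline, feature_vector)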