Example no. 1
                          type=int)
    required.add_argument('--output', help="Output directory", required=True)

    args = parser.parse_args()
    input = args.input
    vec_size = args.size
    output = args.output

    if not input.endswith('/'):
        input = input + '/'
    if not output.endswith('/'):
        output = output + '/'
    if not os.path.exists(output):
        os.makedirs(output)

    sentences = get_sentences(input)

    tagged_data = [
        TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)])
        for i, _d in enumerate(sentences)
    ]

    max_epochs = 100
    alpha = 0.025

    model = Doc2Vec(size=vec_size,
                    alpha=alpha,
                    min_alpha=0.00025,
                    min_count=1,
                    dm=1)
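The excerpt stops right after the model is constructed. Assuming the script follows the usual pre-4.0 gensim training pattern (where Doc2Vec still accepts size= and exposes model.iter), a plausible continuation would build the vocabulary from tagged_data and train for max_epochs with a slowly decaying learning rate; the output file name below is illustrative only:

    model.build_vocab(tagged_data)

    for epoch in range(max_epochs):
        # Re-train on the same corpus each epoch, shrinking the learning rate a little each pass.
        model.train(tagged_data,
                    total_examples=model.corpus_count,
                    epochs=model.iter)
        model.alpha -= 0.0002
        model.min_alpha = model.alpha

    model.save(output + 'doc2vec.model')  # illustrative file name, not from the excerpt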
Example no. 2
    sentence = args.sentence
    alpha = args.alpha
    eta = args.eta
    bigram = args.bigram
    max_df = args.max_df
    min_df = args.min_df

    if not input_dir.endswith('/'):
        input_dir = input_dir + '/'
    if not output_dir.endswith('/'):
        output_dir = output_dir + '/'

    if sentence:
        logging.info('Reading sentences of messages...')
        # Get sentences of messages
        messages = get_sentences(input_dir)
    else:
        logging.info('Reading messages...')
        # Get the whole messages
        messages = get_messages(input_dir)

    logging.info('Running LDA with {} clusters...'.format(n_clusters))

    # Initialise the count vectorizer
    if bigram:
        count_vectorizer = CountVectorizer(analyzer='word',
                                           stop_words=STOP_WORDS,
                                           ngram_range=(1, 2),
                                           max_df=max_df,
                                           min_df=min_df)
    else:
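The excerpt is cut off at the else: branch; presumably it builds the unigram counterpart of the vectorizer above. A sketch mirroring the bigram branch's parameters (an assumption, since the body is not shown):

        count_vectorizer = CountVectorizer(analyzer='word',
                                           stop_words=STOP_WORDS,
                                           max_df=max_df,
                                           min_df=min_df)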
Example no. 3
def index(word):
    return jsonify(get_sentences(word))
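On its own the handler above is not wired to anything; a minimal self-contained sketch of how it could be exposed with Flask follows. The URL rule and the get_sentences stub are assumptions for illustration, not part of the original project:

from flask import Flask, jsonify

app = Flask(__name__)

def get_sentences(word):
    # Hypothetical stand-in; the real project supplies this lookup.
    return [word]

@app.route('/sentences/<word>')
def index(word):
    return jsonify(get_sentences(word))

if __name__ == '__main__':
    app.run()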
Example no. 4
    min_cl = args.min_cl
    max_cl = args.max_cl
    samples = args.samples
    keywords = args.keywords
    sentence = args.sentence
    vector_path = args.vector_path

    if not input.endswith('/'):
        input = input + '/'
    if not output.endswith('/'):
        output = output + '/'

    if sentence:
        logging.info('Reading sentences of emails...')
        # Get sentences of emails
        emails = get_sentences(input)
    else:
        logging.info('Reading emails...')
        # Get emails
        emails = get_emails(input)

    # Max number of clusters is always n_samples/2 if not specified
    if max_cl is None:
        max_cl = len(emails) // 2
    # Min number of clusters must be greater than 1.
    if min_cl < 2:
        sys.exit('Minimum number of clusters should be greater than 1.')
    if max_cl > len(emails) - 1:
        sys.exit('Maximum number of clusters should be less than n_samples.')
    if min_cl > max_cl:
        sys.exit('Minimum number of clusters should not exceed the maximum.')
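What the script does with the validated range is not part of the excerpt. A common pattern, assumed here purely for illustration, is to sweep every cluster count between min_cl and max_cl and keep the best-scoring one, e.g. with scikit-learn's KMeans and silhouette score (vectors stands in for whatever feature matrix the project builds from the emails):

    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_score

    # `vectors` is assumed to be an (n_samples, n_features) matrix built from `emails`.
    best_k, best_score = None, -1.0
    for k in range(min_cl, max_cl + 1):
        labels = KMeans(n_clusters=k, random_state=0).fit_predict(vectors)
        score = silhouette_score(vectors, labels)
        if score > best_score:
            best_k, best_score = k, score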
Example no. 5
def get_feature_vector(feature_data):
  headline, content = feature_data
  print(headline)
  flat_content = ' '.join(content)
  content_sentences = [get_sentences(paragraph) for paragraph in content]

  qm_count_headline, em_count_headline, digit_count = counts(headline,0)
  qm_count_content, em_count_content = counts(flat_content,1)

  headline = headline.split()
  content = [paragraph.split() for paragraph in content]

  num_contractions_headline = num_contractions(headline)
  num_contractions_content = sum([num_contractions(paragraph) for paragraph in content])
  num_stopwords = sum([count_stopwords(paragraph) for paragraph in content])
  # print(content[0])

  headline = [removePunctuation(word) for word in headline]
  # content = [[word.replace('\'','') for word in paragraph] for paragraph  in content]
  content = [[removePunctuation(word) for word in paragraph] for paragraph in content]
  
  content_words = [word.split()[0] for paragraph in content for word in paragraph if word.split() != []]

  num_words = len(content_words)   
  num_tokens = token_count(' '.join(content_words))
  avg_length_words, longest_word = word_lengths(content_words,num_words)
  avg_length_sentences = num_words/sum(len(sents) for sents in content_sentences)
  starts_with_question_word = int(starts_with_q_word(headline))

  stopword_ratio = num_stopwords/num_words
  contraction_ratio = num_contractions_content/num_words

  headline = ' '.join(headline)
  POS_tags_headline = getPOSTags(headline)
  POS_tags_content = getPOSTags(' '.join(content_words))

  adverb = int(contains_adverb(POS_tags_headline))
  super_adj_count, super_adv_count = superlative_adj_adv_count(POS_tags_content)

  POS_counts =  posTagFeatures(POS_tags_content)

  BERT_keywords = ' '.join(get_key_words(content_words))
  document_sim = get_similarity(BERT_keywords,headline)
  sentence_sims = []
  paragraph_sims = []
  # Similarity of the keywords to each sentence and to each full paragraph.
  for para_idx, paragraph in enumerate(content):
    for sent in content_sentences[para_idx]:
      sentence_sims.append(get_similarity(BERT_keywords, sent))
    paragraph_sims.append(get_similarity(BERT_keywords, ' '.join(paragraph)))
  sentence_sims = sum(sentence_sims)/len(sentence_sims)
  paragraph_sims = sum(paragraph_sims)/len(paragraph_sims)
  
  vector = [qm_count_headline, em_count_headline, digit_count, qm_count_content, em_count_content,
            num_contractions_headline, num_contractions_content, num_stopwords, num_words, num_tokens,
            avg_length_words, longest_word, avg_length_sentences, starts_with_question_word, stopword_ratio,
            contraction_ratio, adverb, super_adj_count, super_adv_count, document_sim, sentence_sims,
            paragraph_sims]
  for item in POS_counts:
    vector.append(int(item))
  print(headline,' DONE')
  return (feature_data[0],vector)
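A small driver, to make the expected input shape explicit: get_feature_vector takes a (headline, list-of-paragraph-strings) pair and returns (headline, feature_vector). The sample article below is invented for illustration:

articles = [
    ('Is this headline clickbait?',
     ['First paragraph of the article body.', 'Second paragraph with more text.']),
]

features = [get_feature_vector(article) for article in articles]
for headline, vector in features:
    print(headline, len(vector))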