    df = sum([unique_term in terms for terms in terms_by_doc_sets])
    # hint: iterate over 'terms_by_doc_sets' and test for the presence of 'unique_term' (you may use a list comprehension); summing the resulting list of booleans gives the document frequency
    # idf
    idf[unique_term] = math.log10(float(n_doc + 1) / df)
    if counter % 1e3 == 0:
        print(counter, "terms processed")
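
# illustrative sketch (toy data assumed, not part of the collection): with 3 documents and a
# term present in 2 of them, df = 2 and idf = log10((3 + 1) / 2) ~ 0.30
toy_sets = [{'graph', 'word'}, {'graph', 'core'}, {'tfidf'}]
toy_df = sum(['graph' in s for s in toy_sets])             # -> 2
toy_idf = math.log10(float(len(toy_sets) + 1) / toy_df)    # -> ~0.30103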

###########################################
# computing features for the training set #
###########################################

w = 3  # sliding window size

print("creating a graph-of-words for the collection")

c_g = terms_to_graph(terms_by_doc, w, overspanning=False)

# sanity check (should return True)
print(len(all_unique_terms) == len(c_g.vs))

print("creating a graph-of-words for each training document")

all_graphs = []
for elt in terms_by_doc:
    all_graphs.append(terms_to_graph([elt], w, overspanning=True))

# sanity checks (should return True)
print(len(terms_by_doc) == len(all_graphs))
print(len(set(terms_by_doc[0])) == len(all_graphs[0].vs))

print("computing vector representations of each training document")
###########################################
# computing features for the training set #
###########################################

w = 3  # sliding window size

print("creating a graph-of-words for the collection")

c_g = terms_to_graph(terms_by_doc, w, overspanning=False)  ### fill the gap ### hint: use the terms_to_graph function with the proper arguments

# sanity check (should return True)
print(len(all_unique_terms) == len(c_g.vs))

print("creating a graph-of-words for each training document")

all_graphs = []
for elt in terms_by_doc:
    all_graphs.append(terms_to_graph([elt], w, overspanning=True))

# sanity checks (should return True)
print(len(terms_by_doc) == len(all_graphs))
print(len(set(terms_by_doc[0])) == len(all_graphs[0].vs))

print("computing vector representations of each training document")

b = 0.003

features_degree = []
features_w_degree = []
features_closeness = []
features_w_closeness = []
features_twicw = [] # we try it only with unweighted degree
features_tfidf = []
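
# illustrative sketch (assumed, not necessarily the intended solution): with igraph, the
# unweighted and weighted degree (strength) of the terms of one training document can be
# read off its graph-of-words, using the 'weight' edge attribute set by terms_to_graph
example_g = all_graphs[0]
example_degrees = dict(zip(example_g.vs['name'], example_g.degree()))
example_w_degrees = dict(zip(example_g.vs['name'],
                             example_g.strength(weights=example_g.es['weight'])))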
Example #3
    keywds_stemmed_unique = list(set(keywds_stemmed))  # remove duplicates (may happen due to n-gram breaking)
    keywds_gold_standard.append(keywds_stemmed_unique)

    if counter % round(len(keywd_names) / 5) == 0:
        print(counter, 'files processed')

##############################
# precompute graphs-of-words #
##############################

### fill the gap (use the terms_to_graph function, store the results in a list named 'gs') ###
gs = []
for abstract in abstracts_cleaned:
    gs.append(terms_to_graph(abstract, 4))

##################################
# graph-based keyword extraction #
##################################

my_percentage = 0.33  # for PR and TF-IDF

method_names = ['kc', 'wkc', 'pr', 'tfidf']
keywords = dict(zip(method_names, [[], [], [], []]))

for counter, g in enumerate(gs):
    # k-core
    core_numbers = core_dec(g, False)
    ### fill the gaps (retain main core as keywords and append the resulting list to 'keywords['kc']') ###
    max_c_n = max(core_numbers.values())
    keywords['kc'].append([kwd for kwd, c_n in core_numbers.items() if c_n == max_c_n])
    keywords_stemmed_unique = list(set(keywords_stemmed))  # remove duplicates (can happen due to n-gram breaking)
    keywords_gold_standard.append(keywords_stemmed_unique)

    # print progress
    if counter % round(len(keyword_names) / 10) == 0:
        print(counter, 'files processed')

###############################
# keyword extraction with gow #
###############################

keywords_gow = []

for counter, abstract in enumerate(abstracts_cleaned):
    # create graph-of-words
    g = terms_to_graph(abstract, w=4)
    # decompose graph-of-words
    core_numbers = dict(zip(g.vs['name'], g.coreness()))
    # retain main core as keywords
    max_c_n = max(core_numbers.values())
    keywords = [kwd for kwd, c_n in core_numbers.items() if c_n == max_c_n]
    # save results
    keywords_gow.append(keywords)

    # print progress
    if counter % round(len(abstracts_cleaned) / 10) == 0:
        print(counter, 'abstracts processed')

keywords_gow_w = []

for counter, abstract in enumerate(abstracts_cleaned):
Example #5
    keywords_stemmed = [stemmer.stem(keyword) for keyword in keywords]
    keywords_stemmed_unique = list(set(keywords_stemmed))  # remove duplicates (may happen due to n-gram breaking)
    keywords_gold_standard.append(keywords_stemmed_unique)

    if counter % round(len(keyword_names) / 10) == 0:
        print(counter, 'files processed')

# In[5]:

##############################
# precompute graphs-of-words #
##############################

gs = [terms_to_graph(abstract, w=SWS) for abstract in abstracts_cleaned]

# In[6]:

##################################
# keyword extraction with k-core #
##################################

keywords_kc = []

for counter, g in enumerate(gs):
    core_numbers = dict(zip(g.vs['name'],
                            g.coreness()))  # compute core numbers
    # retain main core as keywords
    max_c_n = max(core_numbers.values())
    keywords = [kwd for kwd, c_n in core_numbers.items() if c_n == max_c_n]
Example #6
    idf[unique_term] = math.log10(
        (len(terms_by_doc) + 1) / df
    )  ### fill the gap ### hint: use math.log10 and refer to the beginning of Section 2 in the handout
    if counter % 1e3 == 0:
        print(counter, "terms processed")

###########################################
# computing features for the training set #
###########################################

w = 3  # sliding window size

print("creating a graph-of-words for the collection")

c_g = terms_to_graph(
    terms_by_doc, w, overspanning=True
)  ### fill the gap ### hint: use the terms_to_graph function with the proper arguments

# sanity check (should return True)
print(len(all_unique_terms) == len(c_g.vs))

print("creating a graph-of-words for each training document")

all_graphs = []
for elt in terms_by_doc:
    all_graphs.append(terms_to_graph([elt], w, overspanning=True))

# sanity checks (should return True)
print(len(terms_by_doc) == len(all_graphs))
print(len(set(terms_by_doc[0])) == len(all_graphs[0].vs))
Example #7
stpwds = stopwords.words('english')
punct = string.punctuation.replace('-', '')

my_doc = 'A method for solution of systems of linear algebraic equations \
with m-dimensional lambda matrices. A system of linear algebraic \
equations with m-dimensional lambda matrices is considered. \
The proposed method of searching for the solution of this system \
lies in reducing it to a numerical system of a special kind.'

my_doc = my_doc.replace('\n', '')

# pre-process document
my_tokens = clean_text_simple(my_doc, my_stopwords=stpwds, punct=punct)

g = terms_to_graph(my_tokens, 4)

# number of edges
print("Number of edges :", len(g.es), "\n")

# the number of nodes should be equal to the number of unique terms
assert len(g.vs) == len(set(my_tokens)), 'The number of nodes should be equal to the number of unique terms'

edge_weights = []
for edge in g.es:
    source = g.vs[edge.source]['name']
    target = g.vs[edge.target]['name']
    weight = edge['weight']
    edge_weights.append([source, target, weight])
    keywds_stemmed = [stemmer.stem(keywd) for keywd in keywds]
    keywds_stemmed_unique = list(set(keywds_stemmed))  # remove duplicates (may happen due to n-gram breaking)
    keywds_gold_standard.append(keywds_stemmed_unique)

    if counter % round(len(keywd_names) / 5) == 0:
        print(counter, 'files processed')

##############################
# precompute graphs-of-words #
##############################

gs = []
for abstract in abstracts_cleaned:
    gs.append(terms_to_graph(abstract, 4))

##################################
# graph-based keyword extraction #
##################################

my_percentage = 0.33  # for PR and TF-IDF

method_names = ['kc', 'wkc', 'pr', 'tfidf']
keywords = dict(zip(method_names, [[], [], [], []]))

for counter, g in enumerate(gs):
    # k-core
    kcore = core_dec(g, False)
    core_numbers = list(kcore.items())
Example #9
    keywds_stemmed_unique = list(set(keywds_stemmed))  # remove duplicates (may happen due to n-gram breaking)
    keywds_gold_standard.append(keywds_stemmed_unique)

    if counter % round(len(keywd_names) / 5) == 0:
        print(counter, 'files processed')

##############################
# precompute graphs-of-words #
##############################

### fill the gap (use the terms_to_graph function, store the results in a list named 'gs') ###
w = 4
gs = []
for ab in abstracts_cleaned:
    gs.append(terms_to_graph(ab, w))

##################################
# graph-based keyword extraction #
##################################

my_percentage = 0.33  # for PR and TF-IDF

method_names = ['kc', 'wkc', 'pr', 'tfidf']
keywords = dict(zip(method_names, [[], [], [], []]))

for counter, g in enumerate(gs):
    # k-core
    core_numbers = core_dec(g, False)
    ### fill the gaps (retain main core as keywords and append the resulting list to 'keywords['kc']') ###
    max_c_n = max(core_numbers.values())
    keywords['kc'].append([kwd for kwd, c_n in core_numbers.items() if c_n == max_c_n])
    keywds = [keywd for keywd in keywds if keywd not in stpwds]  # remove stopwords (rare but may happen due to n-gram breaking)
    keywds_stemmed = [stemmer.stem(keywd) for keywd in keywds]
    keywds_stemmed_unique = list(set(keywds_stemmed))  # remove duplicates (may happen due to n-gram breaking)
    keywds_gold_standard.append(keywds_stemmed_unique)

    if counter % round(len(keywd_names) / 5) == 0:
        print(counter, 'files processed')

##############################
# precompute graphs-of-words #
##############################

### fill the gap (use the terms_to_graph function, store the results in a list named 'gs') ###
gs = [terms_to_graph(abstract, 4) for abstract in abstracts_cleaned]

##################################
# graph-based keyword extraction #
##################################

my_percentage = 0.33  # for PR and TF-IDF

method_names = ['kc', 'wkc', 'pr', 'tfidf']
keywords = dict(zip(method_names, [[], [], [], []]))

for counter, g in enumerate(gs):
    # k-core
    core_numbers = core_dec(g, False)
    ### fill the gaps (retain main core as keywords and append the resulting list to 'keywords['kc']') ###
    max_c_n = max(core_numbers.values())
    keywords['kc'].append([kwd for kwd, c_n in core_numbers.items() if c_n == max_c_n])
    if counter % 1e3 == 0:
        print(counter, "terms processed")


# In[3]:


###########################################
# computing features for the training set #
###########################################

w = 3  # sliding window size

print("creating a graph-of-words for the collection")

c_g = terms_to_graph([all_unique_terms], w, overspanning=False)

# sanity check (should return True)
print(len(all_unique_terms) == len(c_g.vs))

print("creating a graph-of-words for each training document")

all_graphs = []
for elt in terms_by_doc:
    all_graphs.append(terms_to_graph([elt], w, overspanning=True))

# sanity checks (should return True)
print(len(terms_by_doc) == len(all_graphs))
print(len(set(terms_by_doc[0])) == len(all_graphs[0].vs))

print("computing vector representations of each training document")
Example #12
    keywords = [keyword for keyword in keywords if keyword not in stpwds]  # remove stopwords (rare but may happen due to n-gram breaking)
    keywords_stemmed = [stemmer.stem(keyword) for keyword in keywords]
    keywords_stemmed_unique = list(set(keywords_stemmed))  # remove duplicates (may happen due to n-gram breaking)
    keywords_gold_standard.append(keywords_stemmed_unique)

    if counter % round(len(keyword_names) / 10) == 0:
        print(counter, 'files processed')

##############################
# precompute graphs-of-words #
##############################

gs = [
    terms_to_graph(abstract_cleaned, w=4)
    for abstract_cleaned in abstracts_cleaned
]  ### fill the gap ###

##################################
# keyword extraction with k-core #
##################################

keywords_kc = []

for counter, g in enumerate(gs):
    core_numbers = dict(zip(g.vs['name'],
                            g.coreness()))  # compute core numbers
    # retain main core as keywords
    max_c_n = max(core_numbers.values())
    keywords = [kwd for kwd, c_n in core_numbers.items() if c_n == max_c_n]
Example #13
stpwds = stopwords.words('english')
punct = string.punctuation.replace('-', '')

my_doc = 'A method for solution of systems of linear algebraic equations \
with m-dimensional lambda matrices. A system of linear algebraic \
equations with m-dimensional lambda matrices is considered. \
The proposed method of searching for the solution of this system \
lies in reducing it to a numerical system of a special kind.'

my_doc = my_doc.replace('\n', '')

# pre-process document
my_tokens = clean_text_simple(my_doc, my_stopwords=stpwds, punct=punct)

g = terms_to_graph(my_tokens, 4)

# number of edges
print(len(g.es))

# the number of nodes should be equal to the number of unique terms
assert len(g.vs) == len(set(my_tokens))

edge_weights = []
for edge in g.es:
    source = g.vs[edge.source]['name']
    target = g.vs[edge.target]['name']
    weight = edge['weight']
    edge_weights.append([source, target, weight])

print(edge_weights)
Example #14
stpwds = stopwords.words('english')
punct = string.punctuation.replace('-', '')

my_doc = '''A method for solution of systems of linear algebraic equations 
with m-dimensional lambda matrices. A system of linear algebraic 
equations with m-dimensional lambda matrices is considered. 
The proposed method of searching for the solution of this system 
lies in reducing it to a numerical system of a special kind.'''

my_doc = my_doc.replace('\n', '')

# pre-process document
my_tokens = clean_text_simple(my_doc, my_stopwords=stpwds, punct=punct)

g = terms_to_graph(my_tokens, w=4)

# number of edges
print(len(g.es))

# the number of nodes should be equal to the number of unique terms
assert len(g.vs) == len(set(my_tokens))

edge_weights = []
for edge in g.es:
    source = g.vs[edge.source]['name']
    target = g.vs[edge.target]['name']
    weight = edge['weight']
    edge_weights.append([source, target, weight])

print(edge_weights)
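
# note: each entry of edge_weights has the form [source_term, target_term, weight]; in the
# usual graph-of-words construction the weight counts how many times the two terms co-occur
# within the sliding window (assumed here, since terms_to_graph's internals are not shown)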
Example #15
    keywds_stemmed_unique = list(set(keywds_stemmed))  # remove duplicates (may happen due to n-gram breaking)
    keywds_gold_standard.append(keywds_stemmed_unique)

    if counter % round(len(keywd_names) / 5) == 0:
        print(counter, 'files processed')
#%%

##############################
# precompute graphs-of-words #
##############################

### fill the gap (use the terms_to_graph function, store the results in a list named 'gs') ###

gs = [terms_to_graph(toks, 4) for toks in abstracts_cleaned]

#%%
##################################
# graph-based keyword extraction #
##################################

my_percentage = 0.33  # for PR and TF-IDF

method_names = ['kc', 'wkc', 'pr', 'tfidf']
keywords = dict(zip(method_names, [[], [], [], []]))

for counter, g in enumerate(gs):
    # k-core
    core_numbers = core_dec(g, False)
    ### fill the gaps (retain main core as keywords and append the resulting list to 'keywords['kc']') ###
    max_c_n = max(core_numbers.values())
    keywords['kc'].append([kwd for kwd, c_n in core_numbers.items() if c_n == max_c_n])
Example #16
    if counter % round(len(keywd_names) / 5) == 0:
        print(counter, 'files processed')

##############################
# precompute graphs-of-words #
##############################

### fill the gap (use the terms_to_graph function, store the results in a list named 'gs') ###

gs = []
window_size = 4  #100

print('\n Building graphs with a window size of ', window_size)
for abstract in abstracts_cleaned:
    gs.append(terms_to_graph(abstract, window_size))

##################################
# graph-based keyword extraction #
##################################

print('\n -> Graph based keyword extraction \n')
my_percentage = 0.33  # for PR and TF-IDF

method_names = ['kc', 'wkc', 'pr', 'tfidf']
keywords = dict(zip(method_names, [[], [], [], []]))

for counter, g in enumerate(gs):
    # k-core
    core_numbers = core_dec(g, False)
    # core_numbers = dict(zip(g.vs['name'], g.coreness()))
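    # hedged sketch (not necessarily this script's actual code): the 'pr' and 'tfidf' methods
    # keep the top 'my_percentage' of terms; with igraph, PageRank scores could be obtained as
    # pr_scores = dict(zip(g.vs['name'], g.pagerank(weights=g.es['weight'])))
    # n_keep = max(1, int(round(len(pr_scores) * my_percentage)))
    # keywords['pr'].append(sorted(pr_scores, key=pr_scores.get, reverse=True)[:n_keep])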
Example #17
stpwds = stopwords.words('english')
punct = string.punctuation.replace('-', '')

my_doc = 'A method for solution of systems of linear algebraic equations \
with m-dimensional lambda matrices. A system of linear algebraic \
equations with m-dimensional lambda matrices is considered. \
The proposed method of searching for the solution of this system \
lies in reducing it to a numerical system of a special kind.'

my_doc = my_doc.replace('\n', '')

# pre-process document
my_tokens = clean_text_simple(my_doc, my_stopwords=stpwds, punct=punct)

g = terms_to_graph(my_tokens, 4)

# number of edges
print(len(g.es))

# the number of nodes should be equal to the number of unique terms
assert len(g.vs) == len(set(my_tokens))

edge_weights = []
for edge in g.es:
    source = g.vs[edge.source]['name']
    target = g.vs[edge.target]['name']
    weight = edge['weight']
    edge_weights.append([source, target, weight])

print(edge_weights)